-
Notifications
You must be signed in to change notification settings - Fork 0
/
MyLibrary.bib
9897 lines (9237 loc) · 958 KB
/
MyLibrary.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@article{field_scale-invariance_1993,
title = {Scale-Invariance and Self-Similar ‘Wavelet’ Transforms: {{An}} Analysis of Natural Scenes and Mammalian Visual Systems},
url = {https://www.researchgate.net/publication/266310150_Scale-invariance_and_self-similar_'wavelet'_transforms_An_analysis_of_natural_scenes_and_mammalian_visual_systems},
shorttitle = {Scale-Invariance and Self-Similar ‘Wavelet’ Transforms},
abstract = {The processing of spatial patterns by the mammalian visual system shows a number of similarities to the ‘wavelet transforms’ which have recently attracted considerable interest outside of the...},
journaltitle = {ResearchGate},
urldate = {2016-07-28},
date = {1993},
author = {Field, D. J.},
file = {/Users/fergalcotter/Dropbox/Papers/Field_Scale-invariance and self-similar ‘wavelet’ transforms.pdf;/Users/fergalcotter/Zotero/storage/SAPHZHU3/266310150_Scale-invariance_and_self-similar_'wavelet'_transforms_An_analysis_of_natural_scenes_.html},
note = {00216}
}
@article{gu_recent_2015,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1512.07108},
primaryClass = {cs},
title = {Recent {{Advances}} in {{Convolutional Neural Networks}}},
url = {http://arxiv.org/abs/1512.07108},
abstract = {In the last few years, deep learning has led to very good performance on a variety of problems, such as visual recognition, speech recognition and natural language processing. Among different types of deep neural networks, convolutional neural networks have been most extensively studied. Due to the lack of training data and computing power in early days, it is hard to train a large high-capacity convolutional neural network without overfitting. After the rapid growth in the amount of the annotated data and the recent improvements in the strengths of graphics processor units (GPUs), the research on convolutional neural networks has been emerged swiftly and achieved state-of-the-art results on various tasks. In this paper, we provide a broad survey of the recent advances in convolutional neural networks. Besides, we also introduce some applications of convolutional neural networks in computer vision.},
urldate = {2016-08-09},
date = {2015-12-22},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Learning,Computer Science - Neural and Evolutionary Computing},
author = {Gu, Jiuxiang and Wang, Zhenhua and Kuen, Jason and Ma, Lianyang and Shahroudy, Amir and Shuai, Bing and Liu, Ting and Wang, Xingxing and Wang, Gang},
file = {/Users/fergalcotter/Dropbox/Papers/Gu et al_2015_Recent Advances in Convolutional Neural Networks.pdf;/Users/fergalcotter/Zotero/storage/72QQ5TWA/1512.html},
note = {00002}
}
@incollection{plate_avoiding_2012,
langid = {english},
title = {Avoiding {{Roundoff Error}} in {{Backpropagating Derivatives}}},
isbn = {978-3-642-35288-1 978-3-642-35289-8},
url = {http://link.springer.com/chapter/10.1007/978-3-642-35289-8_15},
abstract = {One significant source of roundoff error in backpropagation networks is the calculation of derivatives of unit outputs with respect to their total inputs. The roundoff error can lead result in high relative error in derivatives, and in particular, derivatives being calculated to be zero when in fact they are small but non-zero. This roundoff error is easily avoided with a simple programming trick which has a small memory overhead (one or two extra floating point numbers per unit) and an insignificant computational overhead.},
number = {7700},
booktitle = {Neural {{Networks}}: {{Tricks}} of the {{Trade}}},
series = {Lecture {{Notes}} in {{Computer Science}}},
publisher = {{Springer Berlin Heidelberg}},
urldate = {2016-08-09},
date = {2012},
pages = {225--230},
keywords = {Algorithm Analysis and Problem Complexity,Artificial Intelligence (incl. Robotics),Complexity,Computation by Abstract Devices,Information Systems Applications (incl. Internet),pattern recognition},
author = {Plate, Tony},
editor = {Montavon, Grégoire and Orr, Geneviève B. and Müller, Klaus-Robert},
file = {/Users/fergalcotter/Dropbox/Papers/Plate_2012_Avoiding Roundoff Error in Backpropagating Derivatives.pdf;/Users/fergalcotter/Zotero/storage/5VEV4NP7/978-3-642-35289-8_15.html},
doi = {10.1007/978-3-642-35289-8_15},
note = {00001}
}
@unpublished{kingsbury_visualisation_2015,
venue = {{Adelaide University}},
title = {Visualisation of {{Convolutional Networks}} and {{Multiscale Scatter}}-{{Nets}}},
date = {2015-11},
keywords = {Unread},
author = {Kingsbury, Nick},
file = {/Users/fergalcotter/Dropbox/Papers/DeconvNets&ScatterNetsTalk1.pdf},
note = {00000}
}
@article{kawaguchi_deep_2016,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1605.07110},
primaryClass = {cs, math, stat},
title = {Deep {{Learning}} without {{Poor Local Minima}}},
url = {http://arxiv.org/abs/1605.07110},
abstract = {In this paper, we prove a conjecture published in 1989 and also partially address an open problem announced at the Conference on Learning Theory (COLT) 2015. For an expected loss function of a deep nonlinear neural network, we prove the following statements under the independence assumption adopted from recent work: 1) the function is non-convex and non-concave, 2) every local minimum is a global minimum, 3) every critical point that is not a global minimum is a saddle point, and 4) the property of saddle points differs for shallow networks (with three layers) and deeper networks (with more than three layers). Moreover, we prove that the same four statements hold for deep linear neural networks with any depth, any widths and no unrealistic assumptions. As a result, we present an instance, for which we can answer to the following question: how difficult to directly train a deep model in theory? It is more difficult than the classical machine learning models (because of the non-convexity), but not too difficult (because of the nonexistence of poor local minima and the property of the saddle points). We note that even though we have advanced the theoretical foundations of deep learning, there is still a gap between theory and practice.},
urldate = {2016-05-25},
date = {2016-05-23},
keywords = {Computer Science - Learning,Mathematics - Optimization and Control,Statistics - Machine Learning},
author = {Kawaguchi, Kenji},
file = {C:\\Users\\fbc23\\Google Drive\\Papers\\May 16\\Kawaguchi_2016_Deep Learning without Poor Local Minima.pdf;/Users/fergalcotter/Zotero/storage/8RAQN7N8/1605.html},
note = {00000}
}
@inproceedings{nguyen_deep_2015,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1412.1897},
location = {{Boston, MA, USA}},
title = {Deep {{Neural Networks}} Are {{Easily Fooled}}: {{High Confidence Predictions}} for {{Unrecognizable Images}}},
url = {http://arxiv.org/abs/1412.1897},
shorttitle = {Deep {{Neural Networks}} Are {{Easily Fooled}}},
abstract = {Deep neural networks (DNNs) have recently been achieving state-of-the-art performance on a variety of pattern-recognition tasks, most notably visual classification problems. Given that DNNs are now able to classify objects in images with near-human-level performance, questions naturally arise as to what differences remain between computer and human vision. A recent study revealed that changing an image (e.g. of a lion) in a way imperceptible to humans can cause a DNN to label the image as something else entirely (e.g. mislabeling a lion a library). Here we show a related result: it is easy to produce images that are completely unrecognizable to humans, but that state-of-the-art DNNs believe to be recognizable objects with 99.99\% confidence (e.g. labeling with certainty that white noise static is a lion). Specifically, we take convolutional neural networks trained to perform well on either the ImageNet or MNIST datasets and then find images with evolutionary algorithms or gradient ascent that DNNs label with high confidence as belonging to each dataset class. It is possible to produce images totally unrecognizable to human eyes that DNNs believe with near certainty are familiar objects, which we call "fooling images" (more generally, fooling examples). Our results shed light on interesting differences between human vision and current DNNs, and raise questions about the generality of DNN computer vision.},
eventtitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
booktitle = {Proceedings of 2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
publisher = {{IEEE}},
urldate = {2016-08-24},
date = {2015-06},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Neural and Evolutionary Computing},
author = {Nguyen, Anh and Yosinski, Jason and Clune, Jeff},
file = {/Users/fergalcotter/Dropbox/Papers/Nguyen et al_2014_Deep Neural Networks are Easily Fooled.pdf;/Users/fergalcotter/Zotero/storage/2FUQVBDB/1412.html},
note = {00164}
}
@article{magarey_motion_1998,
title = {Motion Estimation Using a Complex-Valued Wavelet Transform},
volume = {46},
doi = {10.1109/78.668557},
abstract = {This paper describes a new motion estimation algorithm that is potentially useful for both computer vision and video compression applications, It is hierarchical in structure, using a separable two-dimensional (2-D) discrete wavelet transform (DWT) on each frame to efficiently construct a multiresolution pyramid of subimages, The DWT is based on a complex-valued pair of four-tap FIR filters with Gabor-like characteristics. The resulting complex DWT (CDWT) effectively implements an analysis by an ensemble of Gabor-like filters with a variety of orientations and scales, The phase difference between the subband coefficients of each frame at a given subpel bears a predictable relation to a local translation in the region of the reference frame subtended by that subpel, That relation is used to estimate the displacement field at the coarsest scale of the multiresolution pyramid, Each estimate is accompanied by a directional confidence measure in the form of the parameters of a quadratic matching surface, The initial estimate field is progressively refined by a coarse-to-fine strategy in which finer scale information is appropriately incorporated at each stage, The accuracy, efficiency, and robustness of the new algorithm are demonstrated in comparison testing against hierarchical implementations of intensity gradient-based and fractional-precision block matching motion estimators.},
number = {4},
journaltitle = {IEEE Transactions on Signal Processing},
date = {1998-04},
pages = {1069--1084},
author = {Magarey, J. and Kingsbury, N.},
file = {/Users/fergalcotter/Dropbox/Papers/Magarey_Kingsbury_1998_Motion estimation using a complex-valued wavelet transform.pdf},
note = {00228}
}
@incollection{yosinski_how_2014,
title = {How Transferable Are Features in Deep Neural Networks?},
url = {http://papers.nips.cc/paper/5347-how-transferable-are-features-in-deep-neural-networks.pdf},
booktitle = {Advances in {{Neural Information Processing Systems}} 27},
publisher = {{Curran Associates, Inc.}},
urldate = {2016-07-15},
date = {2014},
pages = {3320--3328},
keywords = {_tablet},
author = {Yosinski, Jason and Clune, Jeff and Bengio, Yoshua and Lipson, Hod},
editor = {Ghahramani, Z. and Welling, M. and Cortes, C. and Lawrence, N. D. and Weinberger, K. Q.},
file = {/Users/fergalcotter/Dropbox/Papers/Yosinski et al_2014_How transferable are features in deep neural networks.pdf;/Users/fergalcotter/Zotero/storage/K87W6KT8/5347-how-transferable-are-features-in-deep-neural-networks.html},
note = {00217}
}
@article{bruna_signal_2013,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1311.4025},
primaryClass = {stat},
title = {Signal {{Recovery}} from {{Pooling Representations}}},
url = {http://arxiv.org/abs/1311.4025},
abstract = {In this work we compute lower Lipschitz bounds of \$\textbackslash{}ell\_p\$ pooling operators for \$p=1, 2, \textbackslash{}infty\$ as well as \$\textbackslash{}ell\_p\$ pooling operators preceded by half-rectification layers. These give sufficient conditions for the design of invertible neural network layers. Numerical experiments on MNIST and image patches confirm that pooling layers can be inverted with phase recovery algorithms. Moreover, the regularity of the inverse pooling, controlled by the lower Lipschitz constant, is empirically verified with a nearest neighbor regression.},
urldate = {2016-02-01},
date = {2013-11-16},
keywords = {Statistics - Machine Learning,Unread},
author = {Bruna, Joan and Szlam, Arthur and LeCun, Yann},
file = {/Users/fergalcotter/Dropbox/Papers/Bruna et al_2013_Signal Recovery from Pooling Representations.pdf;/Users/fergalcotter/Zotero/storage/AHQCZXAD/1311.html},
note = {00008}
}
@inproceedings{oyallon_deep_2015,
location = {{Boston, MA, USA}},
title = {Deep {{Roto}}-{{Translation Scattering}} for {{Object Classification}}},
url = {http://www.cv-foundation.org/openaccess/content_cvpr_2015/html/Oyallon_Deep_Roto-Translation_Scattering_2015_CVPR_paper.html},
eventtitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
booktitle = {Proceedings of 2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
publisher = {{IEEE}},
urldate = {2016-03-01},
date = {2015-06},
pages = {2865--2873},
author = {Oyallon, Edouard and Mallat, Stephane},
file = {/Users/fergalcotter/Dropbox/Papers/Oyallon_Mallat_2015_Deep Roto-Translation Scattering for Object Classification.pdf;/Users/fergalcotter/Zotero/storage/IB99F9KA/Oyallon_Deep_Roto-Translation_Scattering_2015_CVPR_paper.html},
note = {00011}
}
@misc{hinton_recognize_2006,
title = {To {{Recognize Shapes}}, {{First Learn}} to {{Generate Images}}},
date = {2006-10},
keywords = {Unread},
author = {Hinton, Geoff},
file = {/Users/fergalcotter/Dropbox/Papers/Geoff Hinton_2006_To Recognize Shapes, First Learn to Generate Images.pdf},
note = {00000}
}
@incollection{rumelhart_parallel_1986,
location = {{Cambridge, MA, USA}},
title = {Parallel {{Distributed Processing}}: {{Explorations}} in the {{Microstructure}} of {{Cognition}}, {{Vol}}. 1},
isbn = {978-0-262-68053-0},
url = {http://dl.acm.org/citation.cfm?id=104279.104293},
shorttitle = {Parallel {{Distributed Processing}}},
publisher = {{MIT Press}},
urldate = {2016-08-24},
date = {1986},
pages = {318--362},
author = {Rumelhart, D. E. and Hinton, G. E. and Williams, R. J.},
editor = {Rumelhart, David E. and McClelland, James L. and {PDP Research Group}},
file = {/Users/fergalcotter/Dropbox/Papers/Rumelhart et al_1986_Parallel Distributed Processing.pdf},
note = {00087}
}
@incollection{lawrence_neural_2012,
langid = {english},
title = {Neural {{Network Classification}} and {{Prior Class Probabilities}}},
isbn = {978-3-642-35288-1 978-3-642-35289-8},
url = {http://link.springer.com/chapter/10.1007/978-3-642-35289-8_19},
abstract = {A commonly encountered problem in MLP (multi-layer perceptron) classification problems is related to the prior probabilities of the individual classes – if the number of training examples that correspond to each class varies significantly between the classes, then it may be harder for the network to learn the rarer classes in some cases. Such practical experience does not match theoretical results which show that MLPs approximate Bayesian a posteriori probabilities (independent of the prior class probabilities). Our investigation of the problem shows that the difference between the theoretical and practical results lies with the assumptions made in the theory (accurate estimation of Bayesian a posteriori probabilities requires the network to be large enough, training to converge to a global minimum, infinite training data, and the a priori class probabilities of the test set to be correctly represented in the training set). Specifically, the problem can often be traced to the fact that efficient MLP training mechanisms lead to sub-optimal solutions for most practical problems. In this chapter, we demonstrate the problem, discuss possible methods for alleviating it, and introduce new heuristics which are shown to perform well on a sample ECG classification problem. The heuristics may also be used as a simple means of adjusting for unequal misclassification costs.},
number = {7700},
booktitle = {Neural {{Networks}}: {{Tricks}} of the {{Trade}}},
series = {Lecture {{Notes}} in {{Computer Science}}},
publisher = {{Springer Berlin Heidelberg}},
urldate = {2016-08-09},
date = {2012},
pages = {295--309},
keywords = {Algorithm Analysis and Problem Complexity,Artificial Intelligence (incl. Robotics),Complexity,Computation by Abstract Devices,Information Systems Applications (incl. Internet),pattern recognition},
author = {Lawrence, Steve and Burns, Ian and Back, Andrew and Tsoi, Ah Chung and Giles, C. Lee},
editor = {Montavon, Grégoire and Orr, Geneviève B. and Müller, Klaus-Robert},
file = {/Users/fergalcotter/Dropbox/Papers/Lawrence et al_2012_Neural Network Classification and Prior Class Probabilities.pdf;/Users/fergalcotter/Zotero/storage/S29BKNUN/978-3-642-35289-8_19.html},
doi = {10.1007/978-3-642-35289-8_19},
note = {00117}
}
@article{kingma_adam:_2014,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1412.6980},
primaryClass = {cs},
title = {Adam: {{A Method}} for {{Stochastic Optimization}}},
url = {http://arxiv.org/abs/1412.6980},
shorttitle = {Adam},
abstract = {We introduce Adam, an algorithm for first-order gradient-based optimization of stochastic objective functions, based on adaptive estimates of lower-order moments. The method is straightforward to implement, is computationally efficient, has little memory requirements, is invariant to diagonal rescaling of the gradients, and is well suited for problems that are large in terms of data and/or parameters. The method is also appropriate for non-stationary objectives and problems with very noisy and/or sparse gradients. The hyper-parameters have intuitive interpretations and typically require little tuning. Some connections to related algorithms, on which Adam was inspired, are discussed. We also analyze the theoretical convergence properties of the algorithm and provide a regret bound on the convergence rate that is comparable to the best known results under the online convex optimization framework. Empirical results demonstrate that Adam works well in practice and compares favorably to other stochastic optimization methods. Finally, we discuss AdaMax, a variant of Adam based on the infinity norm.},
urldate = {2016-08-07},
date = {2014-12-22},
keywords = {Computer Science - Learning},
author = {Kingma, Diederik and Ba, Jimmy},
file = {/Users/fergalcotter/Dropbox/Papers/Kingma_Ba_2014_Adam.pdf;/Users/fergalcotter/Zotero/storage/33XDZ7EF/1412.html}
}
@article{soatto_visual_2014,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1411.7676},
primaryClass = {cs},
title = {Visual {{Representations}}: {{Defining Properties}} and {{Deep Approximations}}},
url = {http://arxiv.org/abs/1411.7676},
shorttitle = {Visual {{Representations}}},
abstract = {Visual representations are defined in terms of minimal sufficient statistics of visual data, for a class of tasks, that are also invariant to nuisance variability. Minimal sufficiency guarantees that we can store a representation in lieu of raw data with smallest complexity and no performance loss on the task at hand. Invariance guarantees that the statistic is constant with respect to uninformative transformations of the data. We derive analytical expressions for such representations and show they are related to feature descriptors commonly used in computer vision, as well as to convolutional neural networks. This link highlights the assumptions and approximations tacitly assumed by these methods and explains empirical practices such as clamping, pooling and joint normalization.},
urldate = {2016-08-06},
date = {2014-11-27},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {Soatto, Stefano and Chiuso, Alessandro},
file = {/Users/fergalcotter/Dropbox/Papers/Soatto_Chiuso_2014_Visual Representations.pdf;/Users/fergalcotter/Zotero/storage/A9CR6RD2/1411.html},
note = {00000}
}
@incollection{lecun_efficient_2012,
langid = {english},
title = {Efficient {{BackProp}}},
isbn = {978-3-642-35288-1},
url = {http://link.springer.com/chapter/10.1007/978-3-642-35289-8_3},
abstract = {The convergence of back-propagation learning is analyzed so as to explain common phenomenon observed by practitioners. Many undesirable behaviors of backprop can be avoided with tricks that are rarely exposed in serious technical publications. This paper gives some of those tricks, and offers explanations of why they work. Many authors have suggested that second-order optimization methods are advantageous for neural net training. It is shown that most “classical” second-order methods are impractical for large neural networks. A few methods are proposed that do not have these limitations.},
number = {7700},
booktitle = {Neural {{Networks}}: {{Tricks}} of the {{Trade}}},
series = {Lecture {{Notes}} in {{Computer Science}}},
publisher = {{Springer Berlin Heidelberg}},
urldate = {2016-08-09},
date = {2012},
pages = {9--48},
keywords = {Algorithm Analysis and Problem Complexity,Artificial Intelligence (incl. Robotics),Complexity,Computation by Abstract Devices,Information Systems Applications (incl. Internet),pattern recognition},
author = {LeCun, Yann A. and Bottou, Léon and Orr, Genevieve B. and Müller, Klaus-Robert},
editor = {Montavon, Grégoire and Orr, Geneviève B. and Müller, Klaus-Robert},
file = {/Users/fergalcotter/Dropbox/Papers/LeCun et al_2012_Efficient BackProp.pdf;/Users/fergalcotter/Zotero/storage/TPNIZZTW/978-3-642-35289-8_3.html},
doi = {10.1007/978-3-642-35289-8_3},
note = {01085}
}
@article{saxe_exact_2013,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1312.6120},
primaryClass = {cond-mat, q-bio, stat},
title = {Exact Solutions to the Nonlinear Dynamics of Learning in Deep Linear Neural Networks},
url = {http://arxiv.org/abs/1312.6120},
abstract = {Despite the widespread practical success of deep learning methods, our theoretical understanding of the dynamics of learning in deep neural networks remains quite sparse. We attempt to bridge the gap between the theory and practice of deep learning by systematically analyzing learning dynamics for the restricted case of deep linear neural networks. Despite the linearity of their input-output map, such networks have nonlinear gradient descent dynamics on weights that change with the addition of each new hidden layer. We show that deep linear networks exhibit nonlinear learning phenomena similar to those seen in simulations of nonlinear networks, including long plateaus followed by rapid transitions to lower error solutions, and faster convergence from greedy unsupervised pretraining initial conditions than from random initial conditions. We provide an analytical description of these phenomena by finding new exact solutions to the nonlinear dynamics of deep learning. Our theoretical analysis also reveals the surprising finding that as the depth of a network approaches infinity, learning speed can nevertheless remain finite: for a special class of initial conditions on the weights, very deep networks incur only a finite, depth independent, delay in learning speed relative to shallow networks. We show that, under certain conditions on the training data, unsupervised pretraining can find this special class of initial conditions, while scaled random Gaussian initializations cannot. We further exhibit a new class of random orthogonal initial conditions on weights that, like unsupervised pre-training, enjoys depth independent learning times. We further show that these initial conditions also lead to faithful propagation of gradients even in deep nonlinear networks, as long as they operate in a special regime known as the edge of chaos.},
urldate = {2016-08-09},
date = {2013-12-20},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Learning,Computer Science - Neural and Evolutionary Computing,Condensed Matter - Disordered Systems and Neural Networks,Quantitative Biology - Neurons and Cognition,Statistics - Machine Learning},
author = {Saxe, Andrew M. and McClelland, James L. and Ganguli, Surya},
file = {/Users/fergalcotter/Dropbox/Papers/Saxe et al_2013_Exact solutions to the nonlinear dynamics of learning in deep linear neural.pdf;/Users/fergalcotter/Zotero/storage/BKC39HTK/1312.html},
note = {00087}
}
@inproceedings{zhang_image_2009,
location = {{Taipei, Taiwan}},
title = {Image Deconvolution Using a {{Gaussian Scale Mixtures}} Model to Approximate the Wavelet Sparseness Constraint},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4959675},
eventtitle = {2009 {{IEEE International Conference}} on {{Acoustics}}, {{Speech}}, and {{Signal Processing}} ({{ICASSP}})},
booktitle = {Proceedings of 2009 {{IEEE International Conference}} on {{Acoustics}}, {{Speech}}, and {{Signal Processing}} ({{ICASSP}})},
publisher = {{IEEE}},
urldate = {2015-11-03},
date = {2009-04},
pages = {681--684},
author = {Zhang, Yingsong and Kingsbury, Nick},
file = {/Users/fergalcotter/Dropbox/Papers/Zhang_Kingsbury_2009_Image deconvolution using a Gaussian Scale Mixtures model to approximate the.pdf},
note = {00009}
}
@report{radoslaw_mantiuk_-it-yourself_????,
title = {Do-{{It}}-{{Yourself Eye Tracker}}: {{Low}}-{{Cost Pupil}}-{{Based Eye Tracker}} for {{Computer Graphics Applications}}},
abstract = {Eye tracking technologies offer sophisticated methods for
capturing humans’ gaze direction but their popularity in multimedia and
computer graphics systems is still low. One of the main reasons for this
are the high cost of commercial eye trackers that comes to 25,000 euros.
Interestingly, this price seems to stem from the costs incurred in research
rather than the value of used hardware components. In this work we show
that an eye tracker of a satisfactory precision can be built in the budget
of 30 euros. In the paper detailed instruction on how to construct a low
cost pupil-based eye tracker and utilise open source software to control
its behaviour is presented. We test the accuracy of our eye tracker and reveal
that its precision is comparable to commercial video-based devices.
We give an example of application in which our eye tracker is used to
control the depth-of-field rendering in real time virtual environment.},
institution = {{West Pomeranian University of Technology in Szczecin, Faculty of Computer Science}},
author = {Mantiuk, Radoslaw and Kowalik, Michal and Nowosielski, Adam and Bazyluk, Bartosz},
file = {/Users/fergalcotter/Dropbox/Papers/Radoslaw Mantiuk et al_Do-It-Yourself Eye Tracker.pdf},
note = {00032}
}
@article{ghahramani_probabilistic_2015,
langid = {english},
title = {Probabilistic Machine Learning and Artificial Intelligence},
volume = {521},
issn = {0028-0836},
url = {http://www.nature.com/nature/journal/v521/n7553/full/nature14541.html},
doi = {10.1038/nature14541},
abstract = {How can a machine learn from experience? Probabilistic modelling provides a framework for understanding what learning is, and has therefore emerged as one of the principal theoretical and practical approaches for designing machines that learn from data acquired through experience. The probabilistic framework, which describes how to represent and manipulate uncertainty about models and predictions, has a central role in scientific data analysis, machine learning, robotics, cognitive science and artificial intelligence. This Review provides an introduction to this framework, and discusses some of the state-of-the-art advances in the field, namely, probabilistic programming, Bayesian optimization, data compression and automatic model discovery.},
number = {7553},
journaltitle = {Nature},
shortjournal = {Nature},
urldate = {2016-10-25},
date = {2015-05-28},
pages = {452--459},
keywords = {Computer science,Mathematics and computing,Neuroscience},
author = {Ghahramani, Zoubin},
file = {/Users/fergalcotter/Dropbox/Papers/Ghahramani_2015_Probabilistic machine learning and artificial intelligence.pdf;/Users/fergalcotter/Zotero/storage/2BDTM2GI/nature14541.html}
}
@inproceedings{shi_real-time_2016,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1609.05158},
location = {{Las Vegas, NV}},
title = {Real-{{Time Single Image}} and {{Video Super}}-{{Resolution Using}} an {{Efficient Sub}}-{{Pixel Convolutional Neural Network}}},
url = {http://arxiv.org/abs/1609.05158},
abstract = {Recently, several models based on deep neural networks have achieved great success in terms of both reconstruction accuracy and computational performance for single image super-resolution. In these methods, the low resolution (LR) input image is upscaled to the high resolution (HR) space using a single filter, commonly bicubic interpolation, before reconstruction. This means that the super-resolution (SR) operation is performed in HR space. We demonstrate that this is sub-optimal and adds computational complexity. In this paper, we present the first convolutional neural network (CNN) capable of real-time SR of 1080p videos on a single K2 GPU. To achieve this, we propose a novel CNN architecture where the feature maps are extracted in the LR space. In addition, we introduce an efficient sub-pixel convolution layer which learns an array of upscaling filters to upscale the final LR feature maps into the HR output. By doing so, we effectively replace the handcrafted bicubic filter in the SR pipeline with more complex upscaling filters specifically trained for each feature map, whilst also reducing the computational complexity of the overall SR operation. We evaluate the proposed approach using images and videos from publicly available datasets and show that it performs significantly better (+0.15dB on Images and +0.39dB on Videos) and is an order of magnitude faster than previous CNN-based methods.},
eventtitle = {2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
booktitle = {Proceedings of 2016 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
publisher = {{IEEE}},
urldate = {2016-10-18},
date = {2016-09-16},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Statistics - Machine Learning},
author = {Shi, Wenzhe and Caballero, Jose and Huszár, Ferenc and Totz, Johannes and Aitken, Andrew P. and Bishop, Rob and Rueckert, Daniel and Wang, Zehan},
file = {/Users/fergalcotter/Dropbox/Papers/1609.07009.pdf;/Users/fergalcotter/Dropbox/Papers/Shi et al_2016_Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel.pdf;/Users/fergalcotter/Zotero/storage/E2HRVWZA/1609.html}
}
@article{sifre_rigid-motion_2014,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1403.1687},
primaryClass = {cs},
title = {Rigid-{{Motion Scattering}} for {{Texture Classification}}},
url = {http://arxiv.org/abs/1403.1687},
abstract = {A rigid-motion scattering computes adaptive invariants along translations and rotations, with a deep convolutional network. Convolutions are calculated on the rigid-motion group, with wavelets defined on the translation and rotation variables. It preserves joint rotation and translation information, while providing global invariants at any desired scale. Texture classification is studied, through the characterization of stationary processes from a single realization. State-of-the-art results are obtained on multiple texture data bases, with important rotation and scaling variabilities.},
urldate = {2015-11-30},
date = {2014-03-07},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Unread},
author = {Sifre, Laurent and Mallat, Stéphane},
file = {/Users/fergalcotter/Dropbox/Papers/Sifre_Mallat_2014_Rigid-Motion Scattering for Texture Classification.pdf;/Users/fergalcotter/Zotero/storage/BGFJPMFX/1403.html},
note = {00003}
}
@article{srivastava_training_2015,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1507.06228},
primaryClass = {cs},
title = {Training {{Very Deep Networks}}},
url = {http://arxiv.org/abs/1507.06228},
abstract = {Theoretical and empirical evidence indicates that the depth of neural networks is crucial for their success. However, training becomes more difficult as depth increases, and training of very deep networks remains an open problem. Here we introduce a new architecture designed to overcome this. Our so-called highway networks allow unimpeded information flow across many layers on information highways. They are inspired by Long Short-Term Memory recurrent networks and use adaptive gating units to regulate the information flow. Even with hundreds of layers, highway networks can be trained directly through simple gradient descent. This enables the study of extremely deep and efficient architectures.},
urldate = {2016-08-07},
date = {2015-07-22},
keywords = {68T01,Computer Science - Learning,Computer Science - Neural and Evolutionary Computing,G.1.6,I.2.6},
author = {Srivastava, Rupesh Kumar and Greff, Klaus and Schmidhuber, Jürgen},
file = {/Users/fergalcotter/Dropbox/Papers/Srivastava et al_2015_Training Very Deep Networks.pdf;/Users/fergalcotter/Zotero/storage/3CX7XUPJ/1507.html},
note = {00064}
}
@article{hull_database_1994,
title = {A Database for Handwritten Text Recognition Research},
volume = {16},
issn = {0162-8828},
doi = {10.1109/34.291440},
abstract = {An image database for handwritten text recognition research is described. Digital images of approximately 5000 city names, 5000 state names, 10000 ZIP Codes, and 50000 alphanumeric characters are included. Each image was scanned from mail in a working post office at 300 pixels/in in 8-bit gray scale on a high-quality flat bed digitizer. The data were unconstrained for the writer, style, and method of preparation. These characteristics help overcome the limitations of earlier databases that contained only isolated characters or were prepared in a laboratory setting under prescribed circumstances. Also, the database is divided into explicit training and testing sets to facilitate the sharing of results among researchers as well as performance comparisons},
number = {5},
journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
date = {1994-05},
pages = {550--554},
keywords = {8 bit,Character recognition,Cities and towns,Gray-scale,Handwriting recognition,Image databases,Performance analysis,Postal services,Testing,Text recognition,Writing,alphanumeric characters,digital images,flat bed digitizer,gray scale,handwritten text recognition,image database,performance comparisons,performance evaluation,style,visual databases,writer},
author = {Hull, J. J.},
file = {/Users/fergalcotter/Dropbox/Papers/Hull_1994_A database for handwritten text recognition research.pdf;/Users/fergalcotter/Zotero/storage/PG7BZX6P/abs_all.html},
note = {00905}
}
@inproceedings{kingsbury_design_2003,
location = {{Barcelona, Spain}},
title = {Design of {{Q}}-Shift Complex Wavelets for Image Processing Using Frequency Domain Energy Minimization},
volume = {1},
doi = {10.1109/ICIP.2003.1247137},
abstract = {This paper proposes a new method of designing finite-support wavelet filters, based on minimization of energy in key parts of the frequency domain. In particular this technique is shown to be very effective for designing families of filters that are suitable for use in the shift-invariant dual-tree complex wavelet structure that has been developed by the author recently, and has been shown to be important for a range of image processing applications. The dual-tree structure requires most of the wavelet filters to have a well-controlled group delay, equivalent to one quarter of a sample period, in order to achieve optimal shift invariance. The proposed new design technique allows this requirement to be included along with the usual smoothness and perfect reconstruction properties to yield wavelet filters with a unique combination of features: linear phase, tight frame, compact spatial support, good frequency domain selectivity with low sidelobe levels, approximate shift invariance, and good directional selectivity in two or more dimensions.},
eventtitle = {2003 {{IEEE International Conference}} on {{Image Processing}} ({{ICIP}})},
booktitle = {2003 {{IEEE International Conference}} on {{Image Processing}} ({{ICIP}})},
date = {2003-09},
keywords = {Continuous wavelet transforms,Design engineering,Discrete wavelet transforms,Filters,Frequency domain analysis,frequency domain energy minimization,frequency domain selectivity,group delay,image processing,low sidelobe level,minimisation,Minimization methods,Multidimensional signal processing,optimal shift invariance,Propagation delay,Q-shift complex wavelet transform,shift-invariant dual-tree complex wavelet structure,Wavelet domain,wavelet filter,wavelet transforms},
author = {Kingsbury, N.},
file = {/Users/fergalcotter/Dropbox/Papers/Kingsbury_2003_Design of Q-shift complex wavelets for image processing using frequency domain.pdf;/Users/fergalcotter/Zotero/storage/JHCS4WSW/1247137.html},
note = {00115}
}
@thesis{waldspurger_wavelet_2012,
location = {{Paris, France}},
title = {Wavelet Transform Modulus: Phase Retrieval and Scattering},
abstract = {Automatically understanding the content of a natural signal, like a sound or an image, is in
general a difficult task. In their naive representation, signals are indeed complicated objects,
belonging to high-dimensional spaces. With a different representation, they can however be
easier to interpret.
This thesis considers a representation commonly used in these cases, in particular for the
analysis of audio signals: the modulus of the wavelet transform. To better understand the
behaviour of this operator, we study, from a theoretical as well as algorithmic point of view, the
corresponding inverse problem: the reconstruction of a signal from the modulus of its wavelet
transform.
This problem belongs to a wider class of inverse problems: phase retrieval problems. In a
first chapter, we describe a new algorithm, PhaseCut, which numerically solves a generic phase
retrieval problem. Like the similar algorithm PhaseLift, PhaseCut relies on a convex relaxation
of the phase retrieval problem, which happens to be of the same form as relaxations of the widely
studied problem MaxCut. We compare the performances of PhaseCut and PhaseLift, in terms
of precision and complexity.
In the next two chapters, we study the specific case of phase retrieval for the wavelet transform.
We show that any function with no negative frequencies is uniquely determined (up to
a global phase) by the modulus of its wavelet transform, but that the reconstruction from the
modulus is not stable to noise, for a strong notion of stability. However, we prove a local stability
property. We also present a new non-convex phase retrieval algorithm, which is specific to the
case of the wavelet transform, and we numerically study its performances.
Finally, in the last two chapters, we study a more sophisticated representation, built from
the modulus of the wavelet transform: the scattering transform. Our goal is to understand
which properties of a signal are characterized by its scattering transform. We first prove that
the energy of scattering coefficients of a signal, at a given order, is upper bounded by the energy
of the signal itself, convolved with a high-pass filter that depends on the order. We then study
a generalization of the scattering transform, for stationary processes. We show that, in finite
dimension, this generalized transform preserves the norm. In dimension one, we also show that
the generalized scattering coefficients of a process characterize the tail of its distribution.},
pagetotal = {210},
institution = {{École Normale Supérieure}},
type = {phdthesis},
date = {2012-11},
keywords = {Unread},
author = {Waldspurger, Irene},
file = {/Users/fergalcotter/Dropbox/Papers/Waldspurger_2012_Wavelet transform modulus.pdf},
note = {00000}
}
@article{selesnick_hilbert_2001,
title = {{{Hilbert}} Transform Pairs of Wavelet Bases},
volume = {8},
issn = {1070-9908},
doi = {10.1109/97.923042},
abstract = {This paper considers the design of pairs of wavelet bases where the wavelets form a Hilbert transform pair. The derivation is based on the limit functions defined by the infinite product formula. It is found that the scaling filters should be offset from one another by a half sample. This gives an alternative derivation and explanation for the result by Kingsbury (1999), that the dual-tree DWT is (nearly) shift-invariant when the scaling filters satisfy the same offset.},
number = {6},
journaltitle = {IEEE Signal Processing Letters},
date = {2001-06},
pages = {170--173},
keywords = {Delay,Discrete transforms,Discrete wavelet transforms,Fourier transforms,Hilbert transform pairs,Hilbert transforms,Transient analysis,Wavelet analysis,dual-tree DWT,filter bank,filtering theory,half sample,infinite product formula,limit functions,scaling filters,shift-invariant,signal processing,wavelet bases,wavelet transforms,Encoding},
author = {Selesnick, I. W.},
file = {/Users/fergalcotter/Dropbox/Papers/Wavelets and DTCWT/Selesnick_2001_Hilbert transform pairs of wavelet bases.pdf;/Users/fergalcotter/Zotero/storage/GW5DTFXB/Selesnick_2001_Hilbert transform pairs of wavelet bases.pdf;/Users/fergalcotter/Zotero/storage/XE22KAT7/abs_all.html},
note = {00351}
}
@inproceedings{kingsbury_shift_1999,
location = {{Phoenix, AZ, USA}},
title = {Shift Invariant Properties of the Dual-Tree Complex Wavelet Transform},
abstract = {We discuss the shift invariant properties of a new implementation of the Discrete Wavelet Transform, which employs a dual tree of wavelet filters to obtain the real and imaginary parts of complex: wavelet coefficients. This introduces limited redundancy (2(m):1 for m-dimensional signals) and allows the transform to provide approximate shift invariance and directionally selective filters (properties lacking in the traditional wavelet transform) while preserving the usual properties of perfect reconstruction and computational efficiency with good well-balanced frequency responses.},
eventtitle = {1999 {{IEEE International Conference}} on {{Acoustics}}, {{Speech}}, and {{Signal Processing}} ({{ICASSP}})},
booktitle = {Proceedings of 1999 {{IEEE International Conference}} on {{Acoustics}}, {{Speech}}, and {{Signal Processing}} ({{ICASSP}})},
publisher = {{IEEE}},
date = {1999},
pages = {1221--1224},
author = {Kingsbury, N.},
file = {/Users/fergalcotter/Dropbox/Papers/Kingsbury_1999_Shift invariant properties of the dual-tree complex wavelet transform.pdf},
note = {00234}
}
@article{gatys_neural_2015,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1508.06576},
primaryClass = {cs, q-bio},
title = {A {{Neural Algorithm}} of {{Artistic Style}}},
url = {http://arxiv.org/abs/1508.06576},
abstract = {In fine art, especially painting, humans have mastered the skill to create unique visual experiences through composing a complex interplay between the content and style of an image. Thus far the algorithmic basis of this process is unknown and there exists no artificial system with similar capabilities. However, in other key areas of visual perception such as object and face recognition near-human performance was recently demonstrated by a class of biologically inspired vision models called Deep Neural Networks. Here we introduce an artificial system based on a Deep Neural Network that creates artistic images of high perceptual quality. The system uses neural representations to separate and recombine content and style of arbitrary images, providing a neural algorithm for the creation of artistic images. Moreover, in light of the striking similarities between performance-optimised artificial neural networks and biological vision, our work offers a path forward to an algorithmic understanding of how humans create and perceive artistic imagery.},
urldate = {2016-02-02},
date = {2015-08-26},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Neural and Evolutionary Computing,Quantitative Biology - Neurons and Cognition},
author = {Gatys, Leon A. and Ecker, Alexander S. and Bethge, Matthias},
file = {/Users/fergalcotter/Dropbox/Papers/Gatys et al_2015_A Neural Algorithm of Artistic Style.pdf;/Users/fergalcotter/Zotero/storage/I3FS73XK/1508.html},
note = {00007}
}
@inproceedings{szegedy_going_2015,
location = {{Boston, MA, USA}},
title = {Going {{Deeper With Convolutions}}},
url = {http://www.cv-foundation.org/openaccess/content_cvpr_2015/html/Szegedy_Going_Deeper_With_2015_CVPR_paper.html},
eventtitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
booktitle = {Proceedings of 2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
publisher = {{IEEE}},
urldate = {2015-11-29},
date = {2015-06},
pages = {1--9},
keywords = {Key Paper,Unread},
author = {Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew},
file = {/Users/fergalcotter/Dropbox/Papers/Szegedy et al_2015_Going Deeper With Convolutions.pdf;/Users/fergalcotter/Zotero/storage/E7T97XTB/Szegedy_Going_Deeper_With_2015_CVPR_paper.html},
note = {00569}
}
@book{cormen_introduction_2009,
langid = {english},
location = {{Cambridge, Mass}},
title = {Introduction to {{Algorithms}}},
edition = {3},
isbn = {978-0-262-03384-8},
abstract = {Some books on algorithms are rigorous but incomplete; others cover masses of material but lack rigor. Introduction to Algorithms uniquely combines rigor and comprehensiveness. The book covers a broad range of algorithms in depth, yet makes their design and analysis accessible to all levels of readers. Each chapter is relatively self-contained and can be used as a unit of study. The algorithms are described in English and in a pseudocode designed to be readable by anyone who has done a little programming. The explanations have been kept elementary without sacrificing depth of coverage or mathematical rigor.The first edition became a widely used text in universities worldwide as well as the standard reference for professionals. The second edition featured new chapters on the role of algorithms, probabilistic analysis and randomized algorithms, and linear programming. The third edition has been revised and updated throughout. It includes two completely new chapters, on van Emde Boas trees and multithreaded algorithms, substantial additions to the chapter on recurrence (now called "Divide-and-Conquer"), and an appendix on matrices. It features improved treatment of dynamic programming and greedy algorithms and a new notion of edge-based flow in the material on flow networks. Many new exercises and problems have been added for this edition. As of the third edition, this textbook is published exclusively by the MIT Press.},
pagetotal = {1312},
publisher = {{The MIT Press}},
date = {2009-07-31},
author = {Cormen, Thomas and Leiserson, Charles and Rivest, Ronald and Stein, Clifford},
file = {C:\\Users\\fbc23\\Google Drive\\Papers\\Books and Notes\\introduction-to-algorithms-3rd-edition.pdf},
note = {00000}
}
@inproceedings{kingsbury_dual-tree_1998,
location = {{Utah}},
title = {The {{Dual}}-{{Tree Complex Wavelet Transform}}: {{A New Technique For Shift Invariance And Directional Filters}}},
shorttitle = {The {{Dual}}-{{Tree Complex Wavelet Transform}}},
abstract = {A new implementation of the Discrete Wavelet Transform is presented, suitable for a range of signal and image processing applications. It employs a dual tree of wavelet filters to obtain the real and imaginary parts of complex wavelet coefficients. This introduces limited redundancy (4:1 for 2-dimensional signals) and allows the transform to provide approximate shift invariance and directionally selective filters (properties lacking in the traditional wavelet transform) while preserving the usual properties of perfect reconstruction and computational efficiency. An application to texture synthesis is presented. 1. INTRODUCTION Although the Discrete Wavelet Transform (DWT) in its maximally decimated form (Mallat's dyadic filter tree [1]) has established an impressive reputation as a tool for image compression, its use for other signal analysis and reconstruction tasks has been hampered by two main disadvantages: ffl Lack of shift invariance, which means that small shifts in the input...},
eventtitle = {1998 8th {{International Conference}} on {{Digital Signal Processing}} ({{DSP}})},
booktitle = {1998 8th {{International Conference}} on {{Digital Signal Processing}} ({{DSP}})},
publisher = {{IEEE}},
date = {1998-08},
pages = {319--322},
author = {Kingsbury, Nick},
file = {/Users/fergalcotter/Dropbox/Papers/Kingsbury_1998_The Dual-Tree Complex Wavelet Transform.pdf;/Users/fergalcotter/Zotero/storage/ZXPH2C6I/summary.html},
note = {00586}
}
@incollection{yaeger_combining_2012,
langid = {english},
title = {Combining {{Neural Networks}} and {{Context}}-{{Driven Search}} for {{On}}-Line, {{Printed Handwriting Recognition}} in the {{Newton}}},
isbn = {978-3-642-35288-1 978-3-642-35289-8},
url = {http://link.springer.com/chapter/10.1007/978-3-642-35289-8_18},
abstract = {While on-line handwriting recognition is an area of long-standing and ongoing research, the recent emergence of portable, pen-based computers has focused urgent attention on usable, practical solutions. We discuss a combination and improvement of classical methods to produce robust recognition of hand-printed English text, for a recognizer shipping in new models of Apple Computer’s Newton MessagePad® and eMate®. Combining an artificial neural network (ANN), as a character classifier, with a context-driven search over segmentation and word recognition hypotheses provides an effective recognition system. Long-standing issues relative to training, generalization, segmentation, models of context, probabilistic formalisms, etc., need to be resolved, however, to get excellent performance. We present a number of recent innovations in the application of ANNs as character classifiers for word recognition, including integrated multiple representations, normalized output error, negative training, stroke warping, frequency balancing, error emphasis, and quantized weights. User-adaptation and extension to cursive recognition pose continuing challenges.},
number = {7700},
booktitle = {Neural {{Networks}}: {{Tricks}} of the {{Trade}}},
series = {Lecture {{Notes}} in {{Computer Science}}},
publisher = {{Springer Berlin Heidelberg}},
urldate = {2016-08-09},
date = {2012},
pages = {271--293},
keywords = {Algorithm Analysis and Problem Complexity,Artificial Intelligence (incl. Robotics),Complexity,Computation by Abstract Devices,Information Systems Applications (incl. Internet),pattern recognition},
author = {Yaeger, Larry S. and Webb, Brandyn J. and Lyon, Richard F.},
editor = {Montavon, Grégoire and Orr, Geneviève B. and Müller, Klaus-Robert},
file = {/Users/fergalcotter/Dropbox/Papers/Yaeger et al_2012_Combining Neural Networks and Context-Driven Search for On-line, Printed.pdf;/Users/fergalcotter/Zotero/storage/IZ5CECMA/978-3-642-35289-8_18.html},
doi = {10.1007/978-3-642-35289-8_18},
note = {00000}
}
@article{shuman_emerging_2013,
title = {The Emerging Field of Signal Processing on Graphs: {{Extending}} High-Dimensional Data Analysis to Networks and Other Irregular Domains},
volume = {30},
issn = {1053-5888},
doi = {10.1109/MSP.2012.2235192},
shorttitle = {The Emerging Field of Signal Processing on Graphs},
abstract = {In applications such as social, energy, transportation, sensor, and neuronal networks, high-dimensional data naturally reside on the vertices of weighted graphs. The emerging field of signal processing on graphs merges algebraic and spectral graph theoretic concepts with computational harmonic analysis to process such signals on graphs. In this tutorial overview, we outline the main challenges of the area, discuss different ways to define graph spectral domains, which are the analogs to the classical frequency domain, and highlight the importance of incorporating the irregular structures of graph data domains when processing signals on graphs. We then review methods to generalize fundamental operations such as filtering, translation, modulation, dilation, and downsampling to the graph setting and survey the localized, multiscale transforms that have been proposed to efficiently extract information from high-dimensional data on graphs. We conclude with a brief discussion of open issues and possible extensions.},
number = {3},
journaltitle = {IEEE Signal Processing Magazine},
date = {2013-05},
pages = {83--98},
keywords = {Biological neural networks,Frequency domain analysis,Harmonic analysis,Spectral analysis,Tutorials,classical frequency domain,computational harmonic analysis,data analysis,data structures,graph spectral domains,graph theory,high-dimensional data analysis,high-dimensional graph data,information extraction,irregular graph data structures,open issues,signal processing,spectral graph theoretic concepts,weighted graphs,Feature extraction},
author = {Shuman, D. I. and Narang, S. K. and Frossard, P. and Ortega, A. and Vandergheynst, P.},
file = {/Users/fergalcotter/Dropbox/Papers/Shuman et al_2013_The emerging field of signal processing on graphs.pdf;/Users/fergalcotter/Zotero/storage/3FAS2I8W/6494675.html},
note = {00414}
}
@article{bruna_invariant_2013,
title = {Invariant {{Scattering Convolution Networks}}},
volume = {35},
issn = {0162-8828},
doi = {10.1109/TPAMI.2012.230},
abstract = {A wavelet scattering network computes a translation invariant image representation which is stable to deformations and preserves high-frequency information for classification. It cascades wavelet transform convolutions with nonlinear modulus and averaging operators. The first network layer outputs SIFT-type descriptors, whereas the next layers provide complementary invariant information that improves classification. The mathematical analysis of wavelet scattering networks explains important properties of deep convolution networks for classification. A scattering representation of stationary processes incorporates higher order moments and can thus discriminate textures having the same Fourier power spectrum. State-of-the-art classification results are obtained for handwritten digits and texture discrimination, with a Gaussian kernel SVM and a generative PCA classifier.},
number = {8},
journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
date = {2013-08},
pages = {1872--1886},
keywords = {Classification,Computer architecture,Convolution,Fourier power spectrum,Fourier transforms,Gaussian kernel SVM,Gaussian processes,Key Paper,SIFT-type descriptors,Scattering,Similar Work,Wavelet coefficients,averaging operators,complementary invariant information,convolution networks,deep convolution networks,deformations,generative PCA classifier,handwritten character recognition,handwritten digits,high-frequency information,image classification,image texture,invariant scattering convolution networks,invariants,mathematical analysis,network layer,nonlinear modulus,principal component analysis,scattering representation,state-of-the-art classification,stationary process,support vector machines,texture discrimination,translation invariant image representation,wavelet scattering network,wavelet transform convolutions,wavelet transforms,wavelets,Image representation},
author = {Bruna, J. and Mallat, S.},
file = {/Users/fergalcotter/Dropbox/Papers/Bruna_Mallat_2013_Invariant Scattering Convolution Networks.pdf;/Users/fergalcotter/Zotero/storage/RCVIVNU4/articleDetails.html},
note = {00150}
}
@inproceedings{lecun_convolutional_2010,
title = {Convolutional Networks and Applications in Vision},
doi = {10.1109/ISCAS.2010.5537907},
abstract = {Intelligent tasks, such as visual perception, auditory perception, and language understanding require the construction of good internal representations of the world (or "features")? which must be invariant to irrelevant variations of the input while, preserving relevant information. A major question for Machine Learning is how to learn such good features automatically. Convolutional Networks (ConvNets) are a biologically-inspired trainable architecture that can learn invariant features. Each stage in a ConvNets is composed of a filter bank, some nonlinearities, and feature pooling layers. With multiple stages, a ConvNet can learn multi-level hierarchies of features. While ConvNets have been successfully deployed in many commercial applications from OCR to video surveillance, they require large amounts of labeled training samples. We describe new unsupervised learning algorithms, and new non-linear stages that allow ConvNets to be trained with very few labeled samples. Applications to visual object recognition and vision navigation for off-road mobile robots are described.},
eventtitle = {Proceedings of 2010 {{IEEE International Symposium}} on {{Circuits}} and {{Systems}} ({{ISCAS}})},
booktitle = {Proceedings of 2010 {{IEEE International Symposium}} on {{Circuits}} and {{Systems}} ({{ISCAS}})},
date = {2010-05},
pages = {253--256},
keywords = {ConvNets,Key Paper,Learning systems,Navigation,Optical character recognition software,Unread,Video surveillance,Visual perception,_tablet,biologically-inspired architecture,convolutional networks,feature pooling layers,filter bank,intelligent tasks,internal representations,labeled training samples,machine learning,mobile robots,multilevel hierarchies,object recognition,off-road mobile robots,robot vision,unsupervised learning,vision navigation,visual object recognition},
author = {LeCun, Y. and Kavukcuoglu, K. and Farabet, C.},
file = {/Users/fergalcotter/Dropbox/Papers/LeCun et al_2010_Convolutional networks and applications in vision.pdf;/Users/fergalcotter/Zotero/storage/JM4MPRJC/abs_all.html},
note = {00261}
}
@article{lake_human-level_2015,
langid = {english},
title = {Human-Level Concept Learning through Probabilistic Program Induction},
volume = {350},
issn = {0036-8075, 1095-9203},
url = {http://science.sciencemag.org/content/350/6266/1332},
doi = {10.1126/science.aab3050},
abstract = {Handwritten characters drawn by a model
Not only do children learn effortlessly, they do so quickly and with a remarkable ability to use what they have learned as the raw material for creating new stuff. Lake et al. describe a computational model that learns in a similar fashion and does so better than current deep learning algorithms. The model classifies, parses, and recreates handwritten characters, and can generate new letters of the alphabet that look “right” as judged by Turing-like tests of the model's output in comparison to what real humans produce.
Science, this issue p. 1332
People learning new concepts can often generalize successfully from just a single example, yet machine learning algorithms typically require tens or hundreds of examples to perform with similar accuracy. People can also use learned concepts in richer ways than conventional algorithms—for action, imagination, and explanation. We present a computational model that captures these human learning abilities for a large class of simple visual concepts: handwritten characters from the world’s alphabets. The model represents concepts as simple programs that best explain observed examples under a Bayesian criterion. On a challenging one-shot classification task, the model achieves human-level performance while outperforming recent deep learning approaches. We also present several “visual Turing tests” probing the model’s creative generalization abilities, which in many cases are indistinguishable from human behavior.
Combining the capacity to handle noise with probabilistic learning yields humanlike performance in a computational model.
Combining the capacity to handle noise with probabilistic learning yields humanlike performance in a computational model.},
number = {6266},
journaltitle = {Science},
urldate = {2016-02-25},
date = {2015-12-11},
pages = {1332--1338},
keywords = {Read Now},
author = {Lake, Brenden M. and Salakhutdinov, Ruslan and Tenenbaum, Joshua B.},
file = {/Users/fergalcotter/Dropbox/Papers/Lake et al_2015_Human-level concept learning through probabilistic program induction.pdf;/Users/fergalcotter/Zotero/storage/J46EU8T2/1332.html},
eprinttype = {pmid},
eprint = {26659050},
note = {00006}
}
@article{porat_localized_1989,
title = {Localized Texture Processing in Vision: Analysis and Synthesis in the {{Gaborian}} Space},
volume = {36},
issn = {0018-9294},
doi = {10.1109/10.16457},
shorttitle = {Localized Texture Processing in Vision},
abstract = {Recent studies of cortical simple cell function suggest that the primitives of image representation in vision have a wavelet form similar to Gabor elementary functions (EFs). It is shown that textures and fully textured images can be practically decomposed into, and synthesized from, a finite set of EFs. Textured-images can be synthesized from a set of EFs using an image coefficient library. Alternatively, texturing of contoured (cartoonlike) images is analogous to adding chromaticity information to contoured images. A method for texture discrimination and image segmentation using local features based on the Gabor approach is introduced. Features related to the EF's parameters provide efficient means for texture discrimination and classification. This method is invariant under rotation and translation. The performance of the classification appears to be robust with respect to noisy conditions. The results show the insensitivity of the discrimination to relatively high noise levels, comparable to the performances of the human observer.},
number = {1},
journaltitle = {IEEE Transactions on Biomedical Engineering},
date = {1989-01},
pages = {115--129},
keywords = {Artificial Intelligence,Bandwidth,Computer Simulation,Depth Perception,Frequency,Gaborian space,Humans,Image Processing; Computer-Assisted,Image edge detection,Image texture analysis,Libraries,Models; Neurological,Noise level,Visual perception,cartoonlike images,chromaticity information,contoured images,cortical simple cell function,image coefficient library,image representation primitives,image segmentation,localized texture processing,noise robustness,noisy conditions,texture discrimination,vision,visual analysis,visual processing,wavelet form,Image representation},
author = {Porat, M. and Zeevi, Y. Y.},
file = {/Users/fergalcotter/Dropbox/Papers/Porat_Zeevi_1989_Localized texture processing in vision.pdf;/Users/fergalcotter/Zotero/storage/2EMBC5J6/articleDetails.html},
note = {00223}
}
@article{serre_robust_2007,
title = {Robust {{Object Recognition}} with {{Cortex}}-{{Like Mechanisms}}},
volume = {29},
issn = {0162-8828},
doi = {10.1109/TPAMI.2007.56},
abstract = {We introduce a new general framework for the recognition of complex visual scenes, which is motivated by biology: We describe a hierarchical system that closely follows the organization of visual cortex and builds an increasingly complex and invariant feature representation by alternating between a template matching and a maximum pooling operation. We demonstrate the strength of the approach on a range of recognition tasks: From invariant single object recognition in clutter to multiclass categorization problems and complex scene understanding tasks that rely on the recognition of both shape-based as well as texture-based objects. Given the biological constraints that the system had to satisfy, the approach performs surprisingly well: It has the capability of learning from only a few training examples and competes with state-of-the-art systems. We also discuss the existence of a universal, redundant dictionary of features that could handle the recognition of most object categories. In addition to its relevance for computer vision, the success of this approach suggests a plausibility proof for a class of feedforward models of object recognition in cortex},
number = {3},
journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
date = {2007-03},
pages = {411--426},
keywords = {Algorithms,Artificial Intelligence,Biomimetics,Brain modeling,Computer Simulation,Face detection,Gabor filters,Humans,Image Enhancement,Image Interpretation; Computer-Assisted,Layout,Models; Biological,Neuroscience,Pattern Recognition; Automated,Pattern Recognition; Visual,Reproducibility of Results,Robustness,Sensitivity and Specificity,Streaming media,Unread,complex visual scenes,computer vision,cortex-like mechanisms,image matching,model,multiclass categorization,neural network.,object recognition,robust object recognition,scene understanding,template matching,visual cortex},
author = {Serre, T. and Wolf, L. and Bileschi, S. and Riesenhuber, M. and Poggio, T.},
file = {/Users/fergalcotter/Dropbox/Papers/Serre et al_2007_Robust Object Recognition with Cortex-Like Mechanisms.pdf;/Users/fergalcotter/Zotero/storage/UB4KQ2DT/abs_all.html},
note = {01215}
}
@article{lecun_deep_2015,
langid = {english},
title = {Deep Learning},
volume = {521},
issn = {0028-0836},
url = {http://www.nature.com/nature/journal/v521/n7553/full/nature14539.html},
doi = {10.1038/nature14539},
abstract = {Deep learning allows computational models that are composed of multiple processing layers to learn representations of data with multiple levels of abstraction. These methods have dramatically improved the state-of-the-art in speech recognition, visual object recognition, object detection and many other domains such as drug discovery and genomics. Deep learning discovers intricate structure in large data sets by using the backpropagation algorithm to indicate how a machine should change its internal parameters that are used to compute the representation in each layer from the representation in the previous layer. Deep convolutional nets have brought about breakthroughs in processing images, video, speech and audio, whereas recurrent nets have shone light on sequential data such as text and speech.},
number = {7553},
journaltitle = {Nature},
shortjournal = {Nature},
urldate = {2015-11-19},
date = {2015-05-28},
pages = {436--444},
keywords = {Computer science,Key Paper,Mathematics and computing},
author = {LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
file = {/Users/fergalcotter/Dropbox/Papers/Other Networks/LeCun et al_2015_Deep learning.pdf;/Users/fergalcotter/Zotero/storage/EDFAJTE5/nature14539.html},
note = {00049}
}
@article{hinton_fast_2006,
title = {A {{Fast Learning Algorithm}} for {{Deep Belief Nets}}},
volume = {18},
issn = {0899-7667},
url = {http://dx.doi.org/10.1162/neco.2006.18.7.1527},
doi = {10.1162/neco.2006.18.7.1527},
abstract = {We show how to use "complementary priors" to eliminate the explaining-away effects that make inference difficult in densely connected belief nets that have many hidden layers. Using complementary priors, we derive a fast, greedy algorithm that can learn deep, directed belief networks one layer at a time, provided the top two layers form an undirected associative memory. The fast, greedy algorithm is used to initialize a slower learning procedure that fine-tunes the weights using a contrastive version of the wake-sleep algorithm. After fine-tuning, a network with three hidden layers forms a very good generative model of the joint distribution of handwritten digit images and their labels. This generative model gives better digit classification than the best discriminative learning algorithms. The low-dimensional manifolds on which the digits lie are modeled by long ravines in the free-energy landscape of the top-level associative memory, and it is easy to explore these ravines by using the directed connections to display what the associative memory has in mind.},
number = {7},
journaltitle = {Neural Computation},
urldate = {2016-07-28},
date = {2006-07},
pages = {1527--1554},
author = {Hinton, Geoffrey E. and Osindero, Simon and Teh, Yee-Whye},
file = {/Users/fergalcotter/Dropbox/Papers/Hinton et al_2006_A Fast Learning Algorithm for Deep Belief Nets.pdf},
note = {04328}
}
@article{porter_robust_1997,
title = {Robust Rotation-Invariant Texture Classification: Wavelet, {{Gabor}} Filter and {{GMRF}} Based Schemes},
volume = {144},
issn = {1350-245X},
doi = {10.1049/ip-vis:19971182},
shorttitle = {Robust Rotation-Invariant Texture Classification},
abstract = {Three novel feature extraction schemes for texture classification are proposed. The schemes employ the wavelet transform, a circularly symmetric Gabor filter or a Gaussian Markov random field with a circular neighbour set to achieve rotation-invariant texture classification. The schemes are shown to give a high level of classification accuracy compared to most existing schemes, using both fewer features (four) and a smaller area of analysis (16×16). Furthermore, unlike most existing schemes, the proposed schemes are shown to be rotation invariant and demonstrate a high level of robustness to noise. The performances of the three schemes are compared, indicating that the wavelet-based approach is the most accurate, exhibits the best noise performance and has the lowest computational complexity},
number = {3},
journaltitle = {IEE Proceedings - Vision, Image and Signal Processing},
date = {1997-06},
pages = {180--188},
keywords = {GMRF,Gaussian Markov random field,Gaussian processes,Markov processes,circular neighbour set,circularly symmetric Gabor filter,classification accuracy,computational complexity,filtering theory,image classification,image texture,noise,noise performance,noise robustness,random processes,robust rotation invariant texture classification,wavelet transform,wavelet transforms,Feature extraction},
author = {Porter, R. and Canagarajah, N.},
file = {/Users/fergalcotter/Dropbox/Papers/Porter_Canagarajah_1997_Robust rotation-invariant texture classification.pdf;/Users/fergalcotter/Zotero/storage/ASMUBX96/articleDetails.html},
note = {00210}
}
@inproceedings{pickering_object_2011,
title = {Object Search Using Wavelet-Based Polar Matching for Aerial Imagery},
author = {Pickering, Andy and Kingsbury, Nick},
eventtitle = {Sensor {{Signal Processing}} for {{Defence}} ({{SSPD}})},
booktitle = {Sensor {{Signal Processing}} for {{Defence}} ({{SSPD}})},
location = {{London, UK}},
publisher = {{IEEE}},
date = {2011-09},
url = {http://digital-library.theiet.org/content/conferences/10.1049/ic.2011.0167},
urldate = {2015-11-03},
file = {/Users/fergalcotter/Dropbox/Papers/Pickering_Kingsbury_2011_Object search using wavelet-based polar matching for aerial imagery.pdf},
note = {00000}
}
@inproceedings{simonyan_deep_2014,
title = {Deep {{Inside Convolutional Networks}}: {{Visualising Image Classification Models}} and {{Saliency Maps}}},
url = {http://arxiv.org/abs/1312.6034},
shorttitle = {Deep {{Inside Convolutional Networks}}},
abstract = {This paper addresses the visualisation of image classification models, learnt using deep Convolutional Networks (ConvNets). We consider two visualisation techniques, based on computing the gradient of the class score with respect to the input image. The first one generates an image, which maximises the class score [Erhan et al., 2009], thus visualising the notion of the class, captured by a ConvNet. The second technique computes a class saliency map, specific to a given image and class. We show that such maps can be employed for weakly supervised object segmentation using classification ConvNets. Finally, we establish the connection between the gradient-based ConvNet visualisation methods and deconvolutional networks [Zeiler et al., 2013].},
booktitle = {Proceedings of the International Conference on Learning Representations (ICLR)},
urldate = {2015-12-01},
date = {2014},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Unread,_tablet},
author = {Simonyan, Karen and Vedaldi, Andrea and Zisserman, Andrew},
file = {/Users/fergalcotter/Dropbox/Papers/Simonyan et al_2014_Deep Inside Convolutional Networks.pdf;/Users/fergalcotter/Zotero/storage/7GECRCNB/1312.html},
note = {00092}
}
@incollection{larsen_adaptive_2012,
langid = {english},
title = {Adaptive {{Regularization}} in {{Neural Network Modeling}}},
isbn = {978-3-642-35288-1 978-3-642-35289-8},
url = {http://link.springer.com/chapter/10.1007/978-3-642-35289-8_8},
abstract = {In this paper we address the important problem of optimizing regularization parameters in neural network modeling. The suggested optimization scheme is an extended version of the recently presented algorithm [25]. The idea is to minimize an empirical estimate - like the cross-validation estimate - of the generalization error with respect to regularization parameters. This is done by employing a simple iterative gradient descent scheme using virtually no additional programming overhead compared to standard training. Experiments with feed-forward neural network models for time series prediction and classification tasks showed the viability and robustness of the algorithm. Moreover, we provided some simple theoretical examples in order to illustrate the potential and limitations of the proposed regularization framework.},
volume = {7700},
booktitle = {Neural {{Networks}}: {{Tricks}} of the {{Trade}}},
series = {Lecture {{Notes}} in {{Computer Science}}},
publisher = {{Springer Berlin Heidelberg}},
urldate = {2016-08-09},
date = {2012},
pages = {111--130},
keywords = {Algorithm Analysis and Problem Complexity,Artificial Intelligence (incl. Robotics),Complexity,Computation by Abstract Devices,Information Systems Applications (incl. Internet),pattern recognition},
author = {Larsen, Jan and Svarer, Claus and Andersen, Lars Nonboe and Hansen, Lars Kai},
editor = {Montavon, Grégoire and Orr, Geneviève B. and Müller, Klaus-Robert},
file = {/Users/fergalcotter/Dropbox/Papers/Larsen et al_2012_Adaptive Regularization in Neural Network Modeling.pdf;/Users/fergalcotter/Zotero/storage/EBC28MWN/978-3-642-35289-8_8.html},
doi = {10.1007/978-3-642-35289-8_8},
note = {00056}
}
@article{autor_why_2015,
title = {Why {{Are There Still So Many Jobs}}? {{The History}} and {{Future}} of {{Workplace Automation}}},
volume = {29},
issn = {0895-3309},
url = {https://www.aeaweb.org/articles?id=10.1257/jep.29.3.3},
doi = {10.1257/jep.29.3.3},
shorttitle = {Why {{Are There Still So Many Jobs}}?},
abstract = {In this essay, I begin by identifying the reasons that automation has not wiped out a majority of jobs over the decades and centuries.
Automation does indeed substitute for labor—as it is typically intended to do.
However, automation also complements labor, raises output in ways that leads to higher demand for labor, and interacts with adjustments in labor supply.
Journalists and even expert commentators tend to overstate the extent of machine substitution for human labor and ignore the strong complementarities between automation and labor that increase productivity, raise earnings, and augment demand for labor.
Changes in technology do alter the types of jobs available and what those jobs pay.
In the last few decades, one noticeable change has been a "polarization" of the labor market, in which wage gains went disproportionately to those at the top and at the bottom of the income and skill distribution, not to those in the middle; however, I also argue, this polarization is unlikely to continue very far into future.
The final section of this paper reflects on how recent and future advances in artificial intelligence and robotics should shape our thinking about the likely trajectory of occupational change and employment growth.
I argue that the interplay between machine and human comparative advantage allows computers to substitute for workers in performing routine, codifiable tasks while amplifying the comparative advantage of workers in supplying problem-solving skills, adaptability, and creativity.},
number = {3},
journaltitle = {Journal of Economic Perspectives},
urldate = {2016-07-20},
date = {2015-08},
pages = {3--30},
author = {Autor, David H.},
file = {/Users/fergalcotter/Dropbox/Papers/Autor_2015_Why Are There Still So Many Jobs.pdf;/Users/fergalcotter/Zotero/storage/VAATQBII/articles.html},
note = {00000}
}
@article{bovik_multichannel_1990,
title = {Multichannel Texture Analysis Using Localized Spatial Filters},
volume = {12},
issn = {0162-8828},
doi = {10.1109/34.41384},
abstract = {A computational approach for analyzing visible textures is described. Textures are modeled as irradiance patterns containing a limited range of spatial frequencies, where mutually distinct textures differ significantly in their dominant characterizing frequencies. By encoding images into multiple narrow spatial frequency and orientation channels, the slowly varying channel envelopes (amplitude and phase) are used to segregate textural regions of different spatial frequency, orientation, or phase characteristics. Thus, an interpretation of image texture as a region code, or carrier of region information, is emphasized. The channel filters used, known as the two-dimensional Gabor functions, are useful for these purposes in several senses: they have tunable orientation and radial frequency bandwidths and tunable center frequencies, and they optimally achieve joint resolution in space and in spatial frequency. By comparing the channel amplitude responses, one can detect boundaries between textures. Locating large variations in the channel phase responses allows discontinuities in the texture phase to be detected. Examples are given of both types of texture processing using a variety of real and synthetic textures},
number = {1},
journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
date = {1990-01},
pages = {55--73},
keywords = {Demodulation,Frequency,Image analysis,Image texture analysis,Layout,Shape,Spatial filters,Surface texture,channel amplitude responses,discontinuities,filtering and prediction theory,image segmentation,image texture,irradiance patterns,joint resolution,localized spatial filters,multichannel texture analysis,pattern recognition,radial frequency bandwidths,region code,region information,tunable center frequencies,tunable orientation,two-dimensional Gabor functions,visible textures,Encoding},
author = {Bovik, A. C. and Clark, M. and Geisler, W. S.},
file = {/Users/fergalcotter/Dropbox/Papers/Bovik et al_1990_Multichannel texture analysis using localized spatial filters.pdf;/Users/fergalcotter/Zotero/storage/U83GQ6MT/abs_all.html},
note = {01720}
}
@inproceedings{glorot_understanding_2010,
title = {Understanding the Difficulty of Training Deep Feedforward Neural Networks},
author = {Glorot, Xavier and Bengio, Yoshua},
eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}} ({{AISTATS}})},
booktitle = {Proceedings of the {{International Conference}} on {{Artificial Intelligence}} and {{Statistics}} ({{AISTATS}})},
location = {{Sardinia, Italy}},
date = {2010-05},
abstract = {Whereas before 2006 it appears that deep multilayer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activations functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence. 1 Deep Neural Networks Deep learning methods aim at learning feature hierarchies with features from higher levels of the hierarchy formed by the composition of lower level features. They include},
file = {/Users/fergalcotter/Dropbox/Papers/Glorot_Bengio_2010_Understanding the difficulty of training deep feedforward neural networks.pdf;/Users/fergalcotter/Zotero/storage/TQFGSP6G/summary.html},
note = {00649}
}
@incollection{mairal_convolutional_2014-1,
title = {Convolutional {{Kernel Networks}}},
author = {Mairal, Julien and Koniusz, Piotr and Harchaoui, Zaid and Schmid, Cordelia},
editor = {Ghahramani, Z. and Welling, M. and Cortes, C. and Lawrence, N. D. and Weinberger, K. Q.},
booktitle = {Advances in {{Neural Information Processing Systems}} 27},
publisher = {{Curran Associates, Inc.}},
date = {2014},
pages = {2627--2635},
url = {http://papers.nips.cc/paper/5348-convolutional-kernel-networks.pdf},
urldate = {2016-02-01},
keywords = {Unread},
file = {/Users/fergalcotter/Dropbox/Papers/Mairal et al_2014_Convolutional Kernel Networks.pdf;/Users/fergalcotter/Zotero/storage/6R3EB42S/5348-convolutional-kernel-networks.html},
note = {00030}
}
@article{hyeonwoo_noh_learning_????,
title = {Learning {{Deconvolution Network}} for {{Semantic Segmentation}}},
author = {Noh, Hyeonwoo and Hong, Seunghoon and Han, Bohyung},
file = {/Users/fergalcotter/Dropbox/Papers/CNNs/1505.04366v1.pdf},
note = {00073}
}
@inproceedings{kingsbury_rotation-invariant_2006,
location = {{Florence, Italy}},
title = {Rotation-Invariant Local Feature Matching with Complex Wavelets},
url = {http://link.eng.cam.ac.uk/foswiki/pub/Main/NGK/Kingsbury_Eusipco06.pdf},
eventtitle = {Proc. {{European Conference}} on {{Signal Processing}} ({{EUSIPCO}})},
booktitle = {14th {{European Signal Processing Conference}}},
publisher = {{IEEE}},
urldate = {2015-11-03},
date = {2006-09},
pages = {901--904},
keywords = {Useful},
author = {Kingsbury, Nick},
file = {/Users/fergalcotter/Dropbox/Papers/Kingsbury_2006_Rotation-invariant local feature matching with complex wavelets.pdf},
note = {00052}
}
@software{vedaldi_matconvnet_2016,
title = {{MatConvNet} Software},
language = {Matlab},
url = {https://github.com/vlfeat/matconvnet/releases/tag/v1.0-beta20},
abstract = {MatConvNet is an implementation of Convolutional Neural Networks (CNNs)
for MATLAB. The toolbox is designed with an emphasis on simplicity and flexibility.
It exposes the building blocks of CNNs as easy-to-use MATLAB functions, providing
routines for computing linear convolutions with filter banks, feature pooling, and many
more. In this manner, MatConvNet allows fast prototyping of new CNN architectures;
at the same time, it supports efficient computation on CPU and GPU allowing
to train complex models on large datasets such as ImageNet ILSVRC. This document
provides an overview of CNNs and how they are implemented in MatConvNet and
gives the technical details of each computational block in the toolbox.},
version = {1.0 beta 20},
date = {2016-05},
author = {Vedaldi, Andrea and Lenc, Karel and Gupta, Ankush},
file = {/Users/fergalcotter/Dropbox/Papers/Andrea Vedaldi et al_2016_MatConvNet.pdf},
note = {00253}
}
@incollection{orr_speeding_2012,
langid = {english},
title = {Speeding {{Learning}}},
isbn = {978-3-642-35288-1 978-3-642-35289-8},
url = {http://link.springer.com/chapter/10.1007/978-3-642-35289-8_2},
abstract = {There are those who argue that developing fast algorithms is no longer necessary because computers have become so fast. However, we believe that the complexity of our algorithms and the size of our problems will always expand to consume all cycles available, regardless of the speed of ourmachines.Thus, there will never come a time when computational efficiency can or should be ignored. Besides, in the quest to find solutions faster, we also often find better and more stable solutions as well. This section is devoted to techniques for making the learning process in backpropagation (BP) faster and more efficient. It contains a single chapter based on a workshop by Leon Bottou and Yann LeCun. While many alternative learning systems have emerged since the time BP was first introduced, BP is still the most widely used learning algorithm.The reason for this is its simplicity, efficiency, and its general effectiveness on a wide range of problems. Even so, there are many pitfalls in applying it, which is where all these tricks enter.},
volume = {7700},
booktitle = {Neural {{Networks}}: {{Tricks}} of the {{Trade}}},
series = {Lecture {{Notes}} in {{Computer Science}}},
publisher = {{Springer Berlin Heidelberg}},
urldate = {2016-08-09},
date = {2012},
pages = {7--8},
keywords = {Algorithm Analysis and Problem Complexity,Artificial Intelligence (incl. Robotics),Complexity,Computation by Abstract Devices,Information Systems Applications (incl. Internet),pattern recognition},
author = {Orr, Geneviève B. and Müller, Klaus-Robert},
editor = {Montavon, Grégoire and Orr, Geneviève B. and Müller, Klaus-Robert},
file = {/Users/fergalcotter/Dropbox/Papers/Orr_Müller_2012_Speeding Learning.pdf;/Users/fergalcotter/Zotero/storage/7ARKTK9R/978-3-642-35289-8_2.html},
doi = {10.1007/978-3-642-35289-8_2},
note = {00000}
}
@incollection{prechelt_early_2012,
langid = {english},
title = {Early {{Stopping}} — {{But When}}?},
isbn = {978-3-642-35288-1 978-3-642-35289-8},
url = {http://link.springer.com/chapter/10.1007/978-3-642-35289-8_5},
abstract = {Validation can be used to detect when overfitting starts during supervised training of a neural network; training is then stopped before convergence to avoid the overfitting (“early stopping”). The exact criterion used for validation-based early stopping, however, is usually chosen in an ad-hoc fashion or training is stopped interactively. This trick describes how to select a stopping criterion in a systematic fashion; it is a trick for either speeding learning procedures or improving generalization, whichever is more important in the particular situation. An empirical investigation on multi-layer perceptrons shows that there exists a tradeoff between training time and generalization: From the given mix of 1296 training runs using different 12 problems and 24 different network architectures I conclude slower stopping criteria allow for small improvements in generalization (here: about 4\% on average), but cost much more training time (here: about factor 4 longer on average).},
volume = {7700},
booktitle = {Neural {{Networks}}: {{Tricks}} of the {{Trade}}},
series = {Lecture {{Notes}} in {{Computer Science}}},
publisher = {{Springer Berlin Heidelberg}},
urldate = {2016-08-09},
date = {2012},
pages = {53--67},
keywords = {Algorithm Analysis and Problem Complexity,Artificial Intelligence (incl. Robotics),Complexity,Computation by Abstract Devices,Information Systems Applications (incl. Internet),pattern recognition},
author = {Prechelt, Lutz},
editor = {Montavon, Grégoire and Orr, Geneviève B. and Müller, Klaus-Robert},
file = {/Users/fergalcotter/Dropbox/Papers/Prechelt_2012_Early Stopping — But When.pdf;/Users/fergalcotter/Zotero/storage/WKZI47V7/978-3-642-35289-8_5.html},
doi = {10.1007/978-3-642-35289-8_5},
note = {00206}
}
@article{russakovsky_imagenet_2015,
langid = {english},
title = {{{ImageNet Large Scale Visual Recognition Challenge}}},
volume = {115},
issn = {0920-5691, 1573-1405},
url = {http://link.springer.com/article/10.1007/s11263-015-0816-y},
doi = {10.1007/s11263-015-0816-y},
abstract = {The ImageNet Large Scale Visual Recognition Challenge is a benchmark in object category classification and detection on hundreds of object categories and millions of images. The challenge has been run annually from 2010 to present, attracting participation from more than fifty institutions. This paper describes the creation of this benchmark dataset and the advances in object recognition that have been possible as a result. We discuss the challenges of collecting large-scale ground truth annotation, highlight key breakthroughs in categorical object recognition, provide a detailed analysis of the current state of the field of large-scale image classification and object detection, and compare the state-of-the-art computer vision accuracy with human accuracy. We conclude with lessons learned in the 5 years of the challenge, and propose future directions and improvements.},
number = {3},
journaltitle = {International Journal of Computer Vision},
shortjournal = {Int J Comput Vis},
urldate = {2016-08-13},
date = {2015-04-11},
pages = {211--252},
author = {Russakovsky, Olga and Deng, Jia and Su, Hao and Krause, Jonathan and Satheesh, Sanjeev and Ma, Sean and Huang, Zhiheng and Karpathy, Andrej and Khosla, Aditya and Bernstein, Michael and Berg, Alexander C. and Fei-Fei, Li},
file = {/Users/fergalcotter/Dropbox/Papers/Russakovsky et al_2015_ImageNet Large Scale Visual Recognition Challenge.pdf;/Users/fergalcotter/Zotero/storage/IXN6Q53D/s11263-015-0816-y.html},
note = {01188}
}
@article{carandini_linearity_1997,
langid = {english},
title = {Linearity and {{Normalization}} in {{Simple Cells}} of the {{Macaque Primary Visual Cortex}}},
volume = {17},
issn = {0270-6474, 1529-2401},
url = {http://www.jneurosci.org/content/17/21/8621},
abstract = {Simple cells in the primary visual cortex often appear to compute a weighted sum of the light intensity distribution of the visual stimuli that fall on their receptive fields. A linear model of these cells has the advantage of simplicity and captures a number of basic aspects of cell function. It, however, fails to account for important response nonlinearities, such as the decrease in response gain and latency observed at high contrasts and the effects of masking by stimuli that fail to elicit responses when presented alone. To account for these nonlinearities we have proposed a normalization model, which extends the linear model to include mutual shunting inhibition among a large number of cortical cells. Shunting inhibition is divisive, and its effect in the model is to normalize the linear responses by a measure of stimulus energy. To test this model we performed extracellular recordings of simple cells in the primary visual cortex of anesthetized macaques. We presented large stimulus sets consisting of (1) drifting gratings of various orientations and spatiotemporal frequencies; (2) plaids composed of two drifting gratings; and (3) gratings masked by full-screen spatiotemporal white noise. We derived expressions for the model predictions and fitted them to the physiological data. Our results support the normalization model, which accounts for both the linear and the nonlinear properties of the cells. An alternative model, in which the linear responses are subject to a compressive nonlinearity, did not perform nearly as well.},
number = {21},
journaltitle = {The Journal of Neuroscience},
shortjournal = {J. Neurosci.},
urldate = {2016-07-27},
date = {1997-11-01},
pages = {8621--8644},
keywords = {contrast,gain control,masking,noise,nonlinearity,normalization,visual cortex},
author = {Carandini, Matteo and Heeger, David J. and Movshon, J. Anthony},
file = {/Users/fergalcotter/Dropbox/Papers/Carandini et al_1997_Linearity and Normalization in Simple Cells of the Macaque Primary Visual Cortex.pdf;/Users/fergalcotter/Zotero/storage/GI5ZS2GS/8621.html},
eprinttype = {pmid},
eprint = {9334433},
note = {00741}
}
@article{grun_taxonomy_2016,
eprinttype = {arxiv},
eprint = {1606.07757},
primaryClass = {cs},
title = {A {{Taxonomy}} and {{Library}} for {{Visualizing Learned Features}} in {{Convolutional Neural Networks}}},
url = {http://arxiv.org/abs/1606.07757},
abstract = {Over the last decade, Convolutional Neural Networks (CNN) saw a tremendous surge in performance. However, understanding what a network has learned still proves to be a challenging task. To remedy this unsatisfactory situation, a number of groups have recently proposed different methods to visualize the learned models. In this work we suggest a general taxonomy to classify and compare these methods, subdividing the literature into three main categories and providing researchers with a terminology to base their works on. Furthermore, we introduce the FeatureVis library for MatConvNet: an extendable, easy to use open source library for visualizing CNNs. It contains implementations from each of the three main classes of visualization methods and serves as a useful tool for an enhanced understanding of the features learned by intermediate layers, as well as for the analysis of why a network might fail for certain examples.},
urldate = {2016-07-27},
date = {2016-06-24},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
author = {Grün, Felix and Rupprecht, Christian and Navab, Nassir and Tombari, Federico},
file = {/Users/fergalcotter/Dropbox/Papers/Grün et al_2016_A Taxonomy and Library for Visualizing Learned Features in Convolutional Neural.pdf;/Users/fergalcotter/Zotero/storage/ABMFD94A/1606.html},
note = {00000}
}
@article{lecun_tutorial_2006,
title = {A {{Tutorial}} on {{Energy Based Learning}}},
author = {LeCun, Yann and Chopra, Sumit and Hadsell, Raia and Ranzato, Marc' Aurelio and Huang, Fu Jie},
journaltitle = {Predicting Structured Data},
date = {2006-08-19},
url = {http://yann.lecun.com/exdb/publis/pdf/lecun-06.pdf},
urldate = {2016-05-06},
file = {/Users/fergalcotter/Dropbox/Papers/LeCun et al_2006_A Tutorial on Energy Based Learning.pdf;/Users/fergalcotter/Zotero/storage/N6SCVTS8/lecun-06.html},
note = {00206}
}
@article{tang_deep_2013,
archivePrefix = {arXiv},
eprinttype = {arxiv},
eprint = {1306.0239},
primaryClass = {cs, stat},
title = {Deep {{Learning}} Using {{Linear Support Vector Machines}}},
url = {http://arxiv.org/abs/1306.0239},