<h1 id="awesome-computer-vision-models-awesome">Awesome Computer Vision
|
||
Models <a href="https://awesome.re"><img
|
||
src="https://awesome.re/badge-flat.svg" alt="Awesome" /></a></h1>
|
||
<p>A curated list of popular classification, segmentation and detection
|
||
models with corresponding evaluation metrics from papers.</p>
|
||
<h2 id="contents">Contents</h2>
|
||
<ul>
|
||
<li><a href="#classification-models">Classification models</a></li>
|
||
<li><a href="#segmentation-models">Segmentation models</a></li>
|
||
<li><a href="#detection-models">Detection models</a></li>
|
||
</ul>
|
||
<h2 id="classification-models">Classification models</h2>
|
||
<table>
|
||
<colgroup>
|
||
<col style="width: 75%" />
|
||
<col style="width: 6%" />
|
||
<col style="width: 5%" />
|
||
<col style="width: 5%" />
|
||
<col style="width: 4%" />
|
||
<col style="width: 2%" />
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th style="text-align: center;">Model</th>
|
||
<th style="text-align: center;">Number of parameters</th>
|
||
<th style="text-align: center;">FLOPS</th>
|
||
<th style="text-align: center;">Top-1 Error</th>
|
||
<th style="text-align: center;">Top-5 Error</th>
|
||
<th style="text-align: center;">Year</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
<tr class="odd">
<td style="text-align: center;">AlexNet (<a href="https://arxiv.org/abs/1404.5997">‘One weird trick for parallelizing convolutional neural networks’</a>)</td>
<td style="text-align: center;">62.3M</td>
<td style="text-align: center;">1,132.33M</td>
<td style="text-align: center;">40.96</td>
<td style="text-align: center;">18.24</td>
<td style="text-align: center;">2014</td>
</tr>
<tr class="even">
<td style="text-align: center;">VGG-16 (<a href="https://arxiv.org/abs/1409.1556">‘Very Deep Convolutional Networks for Large-Scale Image Recognition’</a>)</td>
<td style="text-align: center;">138.3M</td>
<td style="text-align: center;">?</td>
<td style="text-align: center;">26.78</td>
<td style="text-align: center;">8.69</td>
<td style="text-align: center;">2014</td>
</tr>
<tr class="odd">
<td style="text-align: center;">ResNet-10 (<a href="https://arxiv.org/abs/1512.03385">‘Deep Residual Learning for Image Recognition’</a>)</td>
<td style="text-align: center;">5.5M</td>
<td style="text-align: center;">894.04M</td>
<td style="text-align: center;">34.69</td>
<td style="text-align: center;">14.36</td>
<td style="text-align: center;">2015</td>
</tr>
<tr class="even">
<td style="text-align: center;">ResNet-18 (<a href="https://arxiv.org/abs/1512.03385">‘Deep Residual Learning for Image Recognition’</a>)</td>
<td style="text-align: center;">11.7M</td>
<td style="text-align: center;">1,820.41M</td>
<td style="text-align: center;">28.53</td>
<td style="text-align: center;">9.82</td>
<td style="text-align: center;">2015</td>
</tr>
<tr class="odd">
<td style="text-align: center;">ResNet-34 (<a href="https://arxiv.org/abs/1512.03385">‘Deep Residual Learning for Image Recognition’</a>)</td>
<td style="text-align: center;">21.8M</td>
<td style="text-align: center;">3,672.68M</td>
<td style="text-align: center;">24.84</td>
<td style="text-align: center;">7.80</td>
<td style="text-align: center;">2015</td>
</tr>
<tr class="even">
<td style="text-align: center;">ResNet-50 (<a href="https://arxiv.org/abs/1512.03385">‘Deep Residual Learning for Image Recognition’</a>)</td>
<td style="text-align: center;">25.5M</td>
<td style="text-align: center;">3,877.95M</td>
<td style="text-align: center;">22.28</td>
<td style="text-align: center;">6.33</td>
<td style="text-align: center;">2015</td>
</tr>
<tr class="odd">
<td style="text-align: center;">InceptionV3 (<a href="https://arxiv.org/abs/1512.00567">‘Rethinking the Inception Architecture for Computer Vision’</a>)</td>
<td style="text-align: center;">23.8M</td>
<td style="text-align: center;">?</td>
<td style="text-align: center;">21.2</td>
<td style="text-align: center;">5.6</td>
<td style="text-align: center;">2015</td>
</tr>
<tr class="even">
<td style="text-align: center;">PreResNet-18 (<a href="https://arxiv.org/abs/1603.05027">‘Identity Mappings in Deep Residual Networks’</a>)</td>
<td style="text-align: center;">11.7M</td>
<td style="text-align: center;">1,820.56M</td>
<td style="text-align: center;">28.43</td>
<td style="text-align: center;">9.72</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="odd">
<td style="text-align: center;">PreResNet-34 (<a href="https://arxiv.org/abs/1603.05027">‘Identity Mappings in Deep Residual Networks’</a>)</td>
<td style="text-align: center;">21.8M</td>
<td style="text-align: center;">3,672.83M</td>
<td style="text-align: center;">24.89</td>
<td style="text-align: center;">7.74</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="even">
<td style="text-align: center;">PreResNet-50 (<a href="https://arxiv.org/abs/1603.05027">‘Identity Mappings in Deep Residual Networks’</a>)</td>
<td style="text-align: center;">25.6M</td>
<td style="text-align: center;">3,875.44M</td>
<td style="text-align: center;">22.40</td>
<td style="text-align: center;">6.47</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="odd">
<td style="text-align: center;">DenseNet-121 (<a href="https://arxiv.org/abs/1608.06993">‘Densely Connected Convolutional Networks’</a>)</td>
<td style="text-align: center;">8.0M</td>
<td style="text-align: center;">2,872.13M</td>
<td style="text-align: center;">23.48</td>
<td style="text-align: center;">7.04</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="even">
<td style="text-align: center;">DenseNet-161 (<a href="https://arxiv.org/abs/1608.06993">‘Densely Connected Convolutional Networks’</a>)</td>
<td style="text-align: center;">28.7M</td>
<td style="text-align: center;">7,793.16M</td>
<td style="text-align: center;">22.86</td>
<td style="text-align: center;">6.44</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="odd">
<td style="text-align: center;">PyramidNet-101 (<a href="https://arxiv.org/abs/1610.02915">‘Deep Pyramidal Residual Networks’</a>)</td>
<td style="text-align: center;">42.5M</td>
<td style="text-align: center;">8,743.54M</td>
<td style="text-align: center;">21.98</td>
<td style="text-align: center;">6.20</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="even">
<td style="text-align: center;">ResNeXt-14(32x4d) (<a href="http://arxiv.org/abs/1611.05431">‘Aggregated Residual Transformations for Deep Neural Networks’</a>)</td>
<td style="text-align: center;">9.5M</td>
<td style="text-align: center;">1,603.46M</td>
<td style="text-align: center;">30.32</td>
<td style="text-align: center;">11.46</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="odd">
<td style="text-align: center;">ResNeXt-26(32x4d) (<a href="http://arxiv.org/abs/1611.05431">‘Aggregated Residual Transformations for Deep Neural Networks’</a>)</td>
<td style="text-align: center;">15.4M</td>
<td style="text-align: center;">2,488.07M</td>
<td style="text-align: center;">24.14</td>
<td style="text-align: center;">7.46</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="even">
<td style="text-align: center;">WRN-50-2 (<a href="https://arxiv.org/abs/1605.07146">‘Wide Residual Networks’</a>)</td>
<td style="text-align: center;">68.9M</td>
<td style="text-align: center;">11,405.42M</td>
<td style="text-align: center;">22.53</td>
<td style="text-align: center;">6.41</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="odd">
<td style="text-align: center;">Xception (<a href="https://arxiv.org/abs/1610.02357">‘Xception: Deep Learning with Depthwise Separable Convolutions’</a>)</td>
<td style="text-align: center;">22,855,952</td>
<td style="text-align: center;">8,403.63M</td>
<td style="text-align: center;">20.97</td>
<td style="text-align: center;">5.49</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="even">
<td style="text-align: center;">InceptionV4 (<a href="https://arxiv.org/abs/1602.07261">‘Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning’</a>)</td>
<td style="text-align: center;">42,679,816</td>
<td style="text-align: center;">12,304.93M</td>
<td style="text-align: center;">20.64</td>
<td style="text-align: center;">5.29</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="odd">
<td style="text-align: center;">InceptionResNetV2 (<a href="https://arxiv.org/abs/1602.07261">‘Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning’</a>)</td>
<td style="text-align: center;">55,843,464</td>
<td style="text-align: center;">13,188.64M</td>
<td style="text-align: center;">19.93</td>
<td style="text-align: center;">4.90</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="even">
<td style="text-align: center;">PolyNet (<a href="https://arxiv.org/abs/1611.05725">‘PolyNet: A Pursuit of Structural Diversity in Very Deep Networks’</a>)</td>
<td style="text-align: center;">95,366,600</td>
<td style="text-align: center;">34,821.34M</td>
<td style="text-align: center;">19.10</td>
<td style="text-align: center;">4.52</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="odd">
<td style="text-align: center;">DarkNet Ref (<a href="https://github.com/pjreddie/darknet">‘Darknet: Open source neural networks in C’</a>)</td>
<td style="text-align: center;">7,319,416</td>
<td style="text-align: center;">367.59M</td>
<td style="text-align: center;">38.58</td>
<td style="text-align: center;">17.18</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="even">
<td style="text-align: center;">DarkNet Tiny (<a href="https://github.com/pjreddie/darknet">‘Darknet: Open source neural networks in C’</a>)</td>
<td style="text-align: center;">1,042,104</td>
<td style="text-align: center;">500.85M</td>
<td style="text-align: center;">40.74</td>
<td style="text-align: center;">17.84</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="odd">
<td style="text-align: center;">DarkNet 53 (<a href="https://github.com/pjreddie/darknet">‘Darknet: Open source neural networks in C’</a>)</td>
<td style="text-align: center;">41,609,928</td>
<td style="text-align: center;">7,133.86M</td>
<td style="text-align: center;">21.75</td>
<td style="text-align: center;">5.64</td>
<td style="text-align: center;">2016</td>
</tr>
<tr class="even">
|
||
<td style="text-align: center;">SqueezeResNet1.1 (<a
|
||
href="https://arxiv.org/abs/1602.07360">‘SqueezeNet: AlexNet-level
|
||
accuracy with 50x fewer parameters and <0.5MB model size’</a>)</td>
|
||
<td style="text-align: center;">1,235,496</td>
|
||
<td style="text-align: center;">352.02M</td>
|
||
<td style="text-align: center;">40.09</td>
|
||
<td style="text-align: center;">18.21</td>
|
||
<td style="text-align: center;">2016</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">SqueezeNet1.1 (<a
|
||
href="https://arxiv.org/abs/1602.07360">‘SqueezeNet: AlexNet-level
|
||
accuracy with 50x fewer parameters and <0.5MB model size’</a>)</td>
|
||
<td style="text-align: center;">1,235,496</td>
|
||
<td style="text-align: center;">352.02M</td>
|
||
<td style="text-align: center;">39.31</td>
|
||
<td style="text-align: center;">17.72</td>
|
||
<td style="text-align: center;">2016</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">ResAttNet-92 (<a
|
||
href="https://arxiv.org/abs/1704.06904">‘Residual Attention Network for
|
||
Image Classification’</a>)</td>
|
||
<td style="text-align: center;">51.3M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">19.5</td>
|
||
<td style="text-align: center;">4.8</td>
|
||
<td style="text-align: center;">2017</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">CondenseNet (G=C=8) (<a
|
||
href="https://arxiv.org/abs/1711.09224">‘CondenseNet: An Efficient
|
||
DenseNet using Learned Group Convolutions’</a>)</td>
|
||
<td style="text-align: center;">4.8M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">26.2</td>
|
||
<td style="text-align: center;">8.3</td>
|
||
<td style="text-align: center;">2017</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">DPN-68 (<a
|
||
href="https://arxiv.org/abs/1707.01629">‘Dual Path Networks’</a>)</td>
|
||
<td style="text-align: center;">12,611,602</td>
|
||
<td style="text-align: center;">2,351.84M</td>
|
||
<td style="text-align: center;">23.24</td>
|
||
<td style="text-align: center;">6.79</td>
|
||
<td style="text-align: center;">2017</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ShuffleNet x1.0 (g=1) (<a
|
||
href="https://arxiv.org/abs/1707.01083">‘ShuffleNet: An Extremely
|
||
Efficient Convolutional Neural Network for Mobile Devices’</a>)</td>
|
||
<td style="text-align: center;">1,531,936</td>
|
||
<td style="text-align: center;">148.13M</td>
|
||
<td style="text-align: center;">34.93</td>
|
||
<td style="text-align: center;">13.89</td>
|
||
<td style="text-align: center;">2017</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">DiracNetV2-18 (<a
|
||
href="https://arxiv.org/abs/1706.00388">‘DiracNets: Training Very Deep
|
||
Neural Networks Without Skip-Connections’</a>)</td>
|
||
<td style="text-align: center;">11,511,784</td>
|
||
<td style="text-align: center;">1,796.62M</td>
|
||
<td style="text-align: center;">31.47</td>
|
||
<td style="text-align: center;">11.70</td>
|
||
<td style="text-align: center;">2017</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">DiracNetV2-34 (<a
|
||
href="https://arxiv.org/abs/1706.00388">‘DiracNets: Training Very Deep
|
||
Neural Networks Without Skip-Connections’</a>)</td>
|
||
<td style="text-align: center;">21,616,232</td>
|
||
<td style="text-align: center;">3,646.93M</td>
|
||
<td style="text-align: center;">28.75</td>
|
||
<td style="text-align: center;">9.93</td>
|
||
<td style="text-align: center;">2017</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">SENet-16 (<a
|
||
href="https://arxiv.org/abs/1709.01507">‘Squeeze-and-Excitation
|
||
Networks’</a>)</td>
|
||
<td style="text-align: center;">31,366,168</td>
|
||
<td style="text-align: center;">5,081.30M</td>
|
||
<td style="text-align: center;">25.65</td>
|
||
<td style="text-align: center;">8.20</td>
|
||
<td style="text-align: center;">2017</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">SENet-154 (<a
|
||
href="https://arxiv.org/abs/1709.01507">‘Squeeze-and-Excitation
|
||
Networks’</a>)</td>
|
||
<td style="text-align: center;">115,088,984</td>
|
||
<td style="text-align: center;">20,745.78M</td>
|
||
<td style="text-align: center;">18.62</td>
|
||
<td style="text-align: center;">4.61</td>
|
||
<td style="text-align: center;">2017</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">MobileNet (<a
|
||
href="https://arxiv.org/abs/1704.04861">‘MobileNets: Efficient
|
||
Convolutional Neural Networks for Mobile Vision Applications’</a>)</td>
|
||
<td style="text-align: center;">4,231,976</td>
|
||
<td style="text-align: center;">579.80M</td>
|
||
<td style="text-align: center;">26.61</td>
|
||
<td style="text-align: center;">8.95</td>
|
||
<td style="text-align: center;">2017</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">NASNet-A 4@1056 (<a
|
||
href="https://arxiv.org/abs/1707.07012">‘Learning Transferable
|
||
Architectures for Scalable Image Recognition’</a>)</td>
|
||
<td style="text-align: center;">5,289,978</td>
|
||
<td style="text-align: center;">584.90M</td>
|
||
<td style="text-align: center;">25.68</td>
|
||
<td style="text-align: center;">8.16</td>
|
||
<td style="text-align: center;">2017</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">NASNet-A 6@4032(<a
|
||
href="https://arxiv.org/abs/1707.07012">‘Learning Transferable
|
||
Architectures for Scalable Image Recognition’</a>)</td>
|
||
<td style="text-align: center;">88,753,150</td>
|
||
<td style="text-align: center;">23,976.44M</td>
|
||
<td style="text-align: center;">18.14</td>
|
||
<td style="text-align: center;">4.21</td>
|
||
<td style="text-align: center;">2017</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">DLA-34 (<a
|
||
href="https://arxiv.org/abs/1707.06484">‘Deep Layer
|
||
Aggregation’</a>)</td>
|
||
<td style="text-align: center;">15,742,104</td>
|
||
<td style="text-align: center;">3,071.37M</td>
|
||
<td style="text-align: center;">25.36</td>
|
||
<td style="text-align: center;">7.94</td>
|
||
<td style="text-align: center;">2017</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">AirNet50-1x64d (r=2) (<a
|
||
href="https://ieeexplore.ieee.org/document/8510896">‘Attention Inspiring
|
||
Receptive-Fields Network for Learning Invariant
|
||
Representations’</a>)</td>
|
||
<td style="text-align: center;">27.43M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">22.48</td>
|
||
<td style="text-align: center;">6.21</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">BAM-ResNet-50 (<a
|
||
href="https://arxiv.org/abs/1807.06514">‘BAM: Bottleneck Attention
|
||
Module’</a>)</td>
|
||
<td style="text-align: center;">25.92M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">23.68</td>
|
||
<td style="text-align: center;">6.96</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">CBAM-ResNet-50 (<a
|
||
href="https://arxiv.org/abs/1807.06521">‘CBAM: Convolutional Block
|
||
Attention Module’</a>)</td>
|
||
<td style="text-align: center;">28.1M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">23.02</td>
|
||
<td style="text-align: center;">6.38</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">1.0-SqNxt-23v5 (<a
|
||
href="https://arxiv.org/abs/1803.10615">‘SqueezeNext: Hardware-Aware
|
||
Neural Network Design’</a>)</td>
|
||
<td style="text-align: center;">921,816</td>
|
||
<td style="text-align: center;">285.82M</td>
|
||
<td style="text-align: center;">40.77</td>
|
||
<td style="text-align: center;">17.85</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">1.5-SqNxt-23v5 (<a
|
||
href="https://arxiv.org/abs/1803.10615">‘SqueezeNext: Hardware-Aware
|
||
Neural Network Design’</a>)</td>
|
||
<td style="text-align: center;">1,953,616</td>
|
||
<td style="text-align: center;">550.97M</td>
|
||
<td style="text-align: center;">33.81</td>
|
||
<td style="text-align: center;">13.01</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">2.0-SqNxt-23v5 (<a
|
||
href="https://arxiv.org/abs/1803.10615">‘SqueezeNext: Hardware-Aware
|
||
Neural Network Design’</a>)</td>
|
||
<td style="text-align: center;">3,366,344</td>
|
||
<td style="text-align: center;">897.60M</td>
|
||
<td style="text-align: center;">29.63</td>
|
||
<td style="text-align: center;">10.66</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">ShuffleNetV2 (<a
|
||
href="https://arxiv.org/abs/1807.11164">‘ShuffleNet V2: Practical
|
||
Guidelines for Efficient CNN Architecture Design’</a>)</td>
|
||
<td style="text-align: center;">2,278,604</td>
|
||
<td style="text-align: center;">149.72M</td>
|
||
<td style="text-align: center;">31.44</td>
|
||
<td style="text-align: center;">11.63</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">456-MENet-24×1(g=3) (<a
|
||
href="https://arxiv.org/abs/1803.09127">‘Merging and Evolution:
|
||
Improving Convolutional Neural Networks for Mobile
|
||
Applications’</a>)</td>
|
||
<td style="text-align: center;">5.3M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">28.4</td>
|
||
<td style="text-align: center;">9.8</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">FD-MobileNet (<a
|
||
href="https://arxiv.org/abs/1802.03750">‘FD-MobileNet: Improved
|
||
MobileNet with A Fast Downsampling Strategy’</a>)</td>
|
||
<td style="text-align: center;">2,901,288</td>
|
||
<td style="text-align: center;">147.46M</td>
|
||
<td style="text-align: center;">34.23</td>
|
||
<td style="text-align: center;">13.38</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">MobileNetV2 (<a
|
||
href="https://arxiv.org/abs/1801.04381">‘MobileNetV2: Inverted Residuals
|
||
and Linear Bottlenecks’</a>)</td>
|
||
<td style="text-align: center;">3,504,960</td>
|
||
<td style="text-align: center;">329.36M</td>
|
||
<td style="text-align: center;">26.97</td>
|
||
<td style="text-align: center;">8.87</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">IGCV3 (<a
|
||
href="https://arxiv.org/abs/1806.00178">‘IGCV3: Interleaved Low-Rank
|
||
Group Convolutions for Efficient Deep Neural Networks’</a>)</td>
|
||
<td style="text-align: center;">3.5M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">28.22</td>
|
||
<td style="text-align: center;">9.54</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">DARTS (<a
|
||
href="https://arxiv.org/abs/1806.09055">‘DARTS: Differentiable
|
||
Architecture Search’</a>)</td>
|
||
<td style="text-align: center;">4.9M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">26.9</td>
|
||
<td style="text-align: center;">9.0</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">PNASNet-5 (<a
|
||
href="https://arxiv.org/abs/1712.00559">‘Progressive Neural Architecture
|
||
Search’</a>)</td>
|
||
<td style="text-align: center;">5.1M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">25.8</td>
|
||
<td style="text-align: center;">8.1</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">AmoebaNet-C (<a
|
||
href="https://arxiv.org/abs/1802.01548">‘Regularized Evolution for Image
|
||
Classifier Architecture Search’</a>)</td>
|
||
<td style="text-align: center;">5.1M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">24.3</td>
|
||
<td style="text-align: center;">7.6</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">MnasNet (<a
|
||
href="https://arxiv.org/abs/1807.11626">‘MnasNet: Platform-Aware Neural
|
||
Architecture Search for Mobile’</a>)</td>
|
||
<td style="text-align: center;">4,308,816</td>
|
||
<td style="text-align: center;">317.67M</td>
|
||
<td style="text-align: center;">31.58</td>
|
||
<td style="text-align: center;">11.74</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">IBN-Net50-a (<a
|
||
href="https://arxiv.org/abs/1807.09441">‘Two at Once: Enhancing Learning
|
||
andGeneralization Capacities via IBN-Net’</a>)</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">22.54</td>
|
||
<td style="text-align: center;">6.32</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">MarginNet (<a
|
||
href="http://papers.nips.cc/paper/7364-large-margin-deep-networks-for-classification.pdf">‘Large
|
||
Margin Deep Networks for Classification’</a>)</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">22.0</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">A^2 Net (<a
|
||
href="http://papers.nips.cc/paper/7318-a2-nets-double-attention-networks.pdf">‘A^2-Nets:
|
||
Double Attention Networks’</a>)</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">23.0</td>
|
||
<td style="text-align: center;">6.5</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">FishNeXt-150 (<a
|
||
href="http://papers.nips.cc/paper/7356-fishnet-a-versatile-backbone-for-image-region-and-pixel-level-prediction.pdf">‘FishNet:
|
||
A Versatile Backbone for Image, Region, and Pixel Level
|
||
Prediction’</a>)</td>
|
||
<td style="text-align: center;">26.2M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">21.5</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2018</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">Shape-ResNet (<a
|
||
href="https://arxiv.org/pdf/1811.12231v2.pdf">‘IMAGENET-TRAINED CNNS ARE
|
||
BIASED TOWARDS TEXTURE; INCREASING SHAPE BIAS IMPROVES ACCURACY AND
|
||
ROBUSTNESS’</a>)</td>
|
||
<td style="text-align: center;">25.5M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">23.28</td>
|
||
<td style="text-align: center;">6.72</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">SimCNN(k=3 train) (<a
|
||
href="https://arxiv.org/pdf/1812.11446.pdf">‘Greedy Layerwise Learning
|
||
Can Scale to ImageNet’</a>)</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">28.4</td>
|
||
<td style="text-align: center;">10.2</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">SKNet-50 (<a
|
||
href="https://arxiv.org/pdf/1903.06586.pdf">‘Selective Kernel
|
||
Networks’</a>)</td>
|
||
<td style="text-align: center;">27.5M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">20.79</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">SRM-ResNet-50 (<a
|
||
href="https://arxiv.org/pdf/1903.10829.pdf">‘SRM : A Style-based
|
||
Recalibration Module for Convolutional Neural Networks’</a>)</td>
|
||
<td style="text-align: center;">25.62M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">22.87</td>
|
||
<td style="text-align: center;">6.49</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">EfficientNet-B0 (<a
|
||
href="http://proceedings.mlr.press/v97/tan19a/tan19a.pdf">‘EfficientNet:
|
||
Rethinking Model Scaling for Convolutional Neural Networks’</a>)</td>
|
||
<td style="text-align: center;">5,288,548</td>
|
||
<td style="text-align: center;">414.31M</td>
|
||
<td style="text-align: center;">24.77</td>
|
||
<td style="text-align: center;">7.52</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">EfficientNet-B7b (<a
|
||
href="http://proceedings.mlr.press/v97/tan19a/tan19a.pdf">‘EfficientNet:
|
||
Rethinking Model Scaling for Convolutional Neural Networks’</a>)</td>
|
||
<td style="text-align: center;">66,347,960</td>
|
||
<td style="text-align: center;">39,010.98M</td>
|
||
<td style="text-align: center;">15.94</td>
|
||
<td style="text-align: center;">3.22</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ProxylessNAS (<a
|
||
href="https://arxiv.org/pdf/1812.00332.pdf">‘PROXYLESSNAS: DIRECT NEURAL
|
||
ARCHITECTURE SEARCH ON TARGET TASK AND HARDWARE’</a>)</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">24.9</td>
|
||
<td style="text-align: center;">7.5</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">MixNet-L (<a
|
||
href="https://arxiv.org/abs/1907.09595">‘MixNet: Mixed Depthwise
|
||
Convolutional Kernels’</a>)</td>
|
||
<td style="text-align: center;">7.3M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">21.1</td>
|
||
<td style="text-align: center;">5.8</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ECA-Net50 (<a
|
||
href="https://arxiv.org/pdf/1910.03151v1.pdf">‘ECA-Net: Efficient
|
||
Channel Attention for Deep Convolutional Neural Networks’</a>)</td>
|
||
<td style="text-align: center;">24.37M</td>
|
||
<td style="text-align: center;">3.86G</td>
|
||
<td style="text-align: center;">22.52</td>
|
||
<td style="text-align: center;">6.32</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">ECA-Net101 (<a
|
||
href="https://arxiv.org/pdf/1910.03151v1.pdf">‘ECA-Net: Efficient
|
||
Channel Attention for Deep Convolutional Neural Networks’</a>)</td>
|
||
<td style="text-align: center;">7.3M</td>
|
||
<td style="text-align: center;">7.35G</td>
|
||
<td style="text-align: center;">21.35</td>
|
||
<td style="text-align: center;">5.66</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ACNet-Densenet121 (<a
|
||
href="https://arxiv.org/abs/1908.03930">‘ACNet: Strengthening the Kernel
|
||
Skeletons for Powerful CNN via Asymmetric Convolution Blocks’</a>)</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">24.18</td>
|
||
<td style="text-align: center;">7.23</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">LIP-ResNet-50 (<a
|
||
href="https://arxiv.org/abs/1908.04156">‘LIP: Local Importance-based
|
||
Pooling’</a>)</td>
|
||
<td style="text-align: center;">23.9M</td>
|
||
<td style="text-align: center;">5.33G</td>
|
||
<td style="text-align: center;">21.81</td>
|
||
<td style="text-align: center;">6.04</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">LIP-ResNet-101 (<a
|
||
href="https://arxiv.org/abs/1908.04156">‘LIP: Local Importance-based
|
||
Pooling’</a>)</td>
|
||
<td style="text-align: center;">42.9M</td>
|
||
<td style="text-align: center;">9.06G</td>
|
||
<td style="text-align: center;">20.67</td>
|
||
<td style="text-align: center;">5.40</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">LIP-DenseNet-BC-121 (<a
|
||
href="https://arxiv.org/abs/1908.04156">‘LIP: Local Importance-based
|
||
Pooling’</a>)</td>
|
||
<td style="text-align: center;">8.7M</td>
|
||
<td style="text-align: center;">4.13G</td>
|
||
<td style="text-align: center;">23.36</td>
|
||
<td style="text-align: center;">6.84</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">MuffNet_1.0 (<a
|
||
href="http://openaccess.thecvf.com/content_ICCVW_2019/papers/CEFRL/Chen_MuffNet_Multi-Layer_Feature_Federation_for_Mobile_Deep_Learning_ICCVW_2019_paper.pdf">‘MuffNet:
|
||
Multi-Layer Feature Federation for Mobile Deep Learning’</a>)</td>
|
||
<td style="text-align: center;">2.3M</td>
|
||
<td style="text-align: center;">146M</td>
|
||
<td style="text-align: center;">30.1</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">MuffNet_1.5 (<a
|
||
href="http://openaccess.thecvf.com/content_ICCVW_2019/papers/CEFRL/Chen_MuffNet_Multi-Layer_Feature_Federation_for_Mobile_Deep_Learning_ICCVW_2019_paper.pdf">‘MuffNet:
|
||
Multi-Layer Feature Federation for Mobile Deep Learning’</a>)</td>
|
||
<td style="text-align: center;">3.4M</td>
|
||
<td style="text-align: center;">300M</td>
|
||
<td style="text-align: center;">26.9</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ResNet-34-Bin-5 (<a
|
||
href="https://arxiv.org/abs/1904.11486">‘Making Convolutional Networks
|
||
Shift-Invariant Again’</a>)</td>
|
||
<td style="text-align: center;">21.8M</td>
|
||
<td style="text-align: center;">3,672.68M</td>
|
||
<td style="text-align: center;">25.80</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">ResNet-50-Bin-5 (<a
|
||
href="https://arxiv.org/abs/1904.11486">‘Making Convolutional Networks
|
||
Shift-Invariant Again’</a>)</td>
|
||
<td style="text-align: center;">25.5M</td>
|
||
<td style="text-align: center;">3,877.95M</td>
|
||
<td style="text-align: center;">22.96</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">MobileNetV2-Bin-5 (<a
|
||
href="https://arxiv.org/abs/1904.11486">‘Making Convolutional Networks
|
||
Shift-Invariant Again’</a>)</td>
|
||
<td style="text-align: center;">3,504,960</td>
|
||
<td style="text-align: center;">329.36M</td>
|
||
<td style="text-align: center;">27.50</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">FixRes ResNeXt101 WSL (<a
|
||
href="https://arxiv.org/abs/1906.06423">‘Fixing the train-test
|
||
resolution discrepancy’</a>)</td>
|
||
<td style="text-align: center;">829M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">13.6</td>
|
||
<td style="text-align: center;">2.0</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">Noisy Student*(L2) (<a
|
||
href="https://arxiv.org/abs/1911.04252">‘Self-training with Noisy
|
||
Student improves ImageNet classification’</a>)</td>
|
||
<td style="text-align: center;">480M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">12.6</td>
|
||
<td style="text-align: center;">1.8</td>
|
||
<td style="text-align: center;">2019</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">TResNet-M (<a
|
||
href="https://arxiv.org/abs/2003.13630">‘TResNet: High Performance
|
||
GPU-Dedicated Architecture’</a>)</td>
|
||
<td style="text-align: center;">29.4M</td>
|
||
<td style="text-align: center;">5.5G</td>
|
||
<td style="text-align: center;">19.3</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2020</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">DA-NAS-C (<a
|
||
href="https://arxiv.org/abs/2003.12563v1">‘DA-NAS: Data Adapted Pruning
|
||
for Efficient Neural Architecture Search’</a>)</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">467M</td>
|
||
<td style="text-align: center;">23.8</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2020</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">ResNeSt-50 (<a
|
||
href="https://arxiv.org/abs/2004.08955">‘ResNeSt: Split-Attention
|
||
Networks’</a>)</td>
|
||
<td style="text-align: center;">27.5M</td>
|
||
<td style="text-align: center;">5.39G</td>
|
||
<td style="text-align: center;">18.87</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2020</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ResNeSt-101 (<a
|
||
href="https://arxiv.org/abs/2004.08955">‘ResNeSt: Split-Attention
|
||
Networks’</a>)</td>
|
||
<td style="text-align: center;">48.3M</td>
|
||
<td style="text-align: center;">10.2G</td>
|
||
<td style="text-align: center;">17.73</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2020</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">ResNet-50-FReLU (<a
|
||
href="https://arxiv.org/abs/2007.11824v2">‘Funnel Activation for Visual
|
||
Recognition’</a>)</td>
|
||
<td style="text-align: center;">25.5M</td>
|
||
<td style="text-align: center;">3.87G</td>
|
||
<td style="text-align: center;">22.40</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2020</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ResNet-101-FReLU (<a
|
||
href="https://arxiv.org/abs/2007.11824v2">‘Funnel Activation for Visual
|
||
Recognition’</a>)</td>
|
||
<td style="text-align: center;">44.5M</td>
|
||
<td style="text-align: center;">7.6G</td>
|
||
<td style="text-align: center;">22.10</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2020</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">ResNet-50-MEALv2 (<a
|
||
href="https://arxiv.org/abs/2009.08453v1">‘MEAL V2: Boosting Vanilla
|
||
ResNet-50 to 80%+ Top-1 Accuracy on ImageNet without Tricks’</a>)</td>
|
||
<td style="text-align: center;">25.6M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">19.33</td>
|
||
<td style="text-align: center;">4.91</td>
|
||
<td style="text-align: center;">2020</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ResNet-50-MEALv2 + CutMix (<a
|
||
href="https://arxiv.org/abs/2009.08453v1">‘MEAL V2: Boosting Vanilla
|
||
ResNet-50 to 80%+ Top-1 Accuracy on ImageNet without Tricks’</a>)</td>
|
||
<td style="text-align: center;">25.6M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">19.02</td>
|
||
<td style="text-align: center;">4.65</td>
|
||
<td style="text-align: center;">2020</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">MobileNet V3-Large-MEALv2 (<a
|
||
href="https://arxiv.org/abs/2009.08453v1">‘MEAL V2: Boosting Vanilla
|
||
ResNet-50 to 80%+ Top-1 Accuracy on ImageNet without Tricks’</a>)</td>
|
||
<td style="text-align: center;">5.48M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">23.08</td>
|
||
<td style="text-align: center;">6.68</td>
|
||
<td style="text-align: center;">2020</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">EfficientNet-B0-MEALv2 (<a
|
||
href="https://arxiv.org/abs/2009.08453v1">‘MEAL V2: Boosting Vanilla
|
||
ResNet-50 to 80%+ Top-1 Accuracy on ImageNet without Tricks’</a>)</td>
|
||
<td style="text-align: center;">5.29M</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">21.71</td>
|
||
<td style="text-align: center;">6.05</td>
|
||
<td style="text-align: center;">2020</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">T2T-ViT-7 (<a
|
||
href="https://arxiv.org/abs/2101.11986v1">‘Tokens-to-Token ViT: Training
|
||
Vision Transformers from Scratch on ImageNet’</a>)</td>
|
||
<td style="text-align: center;">4.2M</td>
|
||
<td style="text-align: center;">0.6G</td>
|
||
<td style="text-align: center;">28.8</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2021</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">T2T-ViT-14 (<a
|
||
href="https://arxiv.org/abs/2101.11986v1">‘Tokens-to-Token ViT: Training
|
||
Vision Transformers from Scratch on ImageNet’</a>)</td>
|
||
<td style="text-align: center;">19.4M</td>
|
||
<td style="text-align: center;">4.8G</td>
|
||
<td style="text-align: center;">19.4</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2021</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">T2T-ViT-19 (<a
|
||
href="https://arxiv.org/abs/2101.11986v1">‘Tokens-to-Token ViT: Training
|
||
Vision Transformers from Scratch on ImageNet’</a>)</td>
|
||
<td style="text-align: center;">39.0M</td>
|
||
<td style="text-align: center;">8.0G</td>
|
||
<td style="text-align: center;">18.8</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2021</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">NFNet-F0 (<a
|
||
href="https://arxiv.org/abs/2102.06171">‘High-Performance Large-Scale
|
||
Image Recognition Without Normalization’</a>)</td>
|
||
<td style="text-align: center;">71.5M</td>
|
||
<td style="text-align: center;">12.38G</td>
|
||
<td style="text-align: center;">16.4</td>
|
||
<td style="text-align: center;">3.2</td>
|
||
<td style="text-align: center;">2021</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">NFNet-F1 (<a
|
||
href="https://arxiv.org/abs/2102.06171">‘High-Performance Large-Scale
|
||
Image Recognition Without Normalization’</a>)</td>
|
||
<td style="text-align: center;">132.6M</td>
|
||
<td style="text-align: center;">35.54G</td>
|
||
<td style="text-align: center;">15.4</td>
|
||
<td style="text-align: center;">2.9</td>
|
||
<td style="text-align: center;">2021</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">NFNet-F6+SAM (<a
|
||
href="https://arxiv.org/abs/2102.06171">‘High-Performance Large-Scale
|
||
Image Recognition Without Normalization’</a>)</td>
|
||
<td style="text-align: center;">438.4M</td>
|
||
<td style="text-align: center;">377.28G</td>
|
||
<td style="text-align: center;">13.5</td>
|
||
<td style="text-align: center;">2.1</td>
|
||
<td style="text-align: center;">2021</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">EfficientNetV2-S (<a
|
||
href="https://arxiv.org/abs/2104.00298">‘EfficientNetV2: Smaller Models
|
||
and Faster Training’</a>)</td>
|
||
<td style="text-align: center;">24M</td>
|
||
<td style="text-align: center;">8.8G</td>
|
||
<td style="text-align: center;">16.1</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2021</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">EfficientNetV2-M (<a
|
||
href="https://arxiv.org/abs/2104.00298">‘EfficientNetV2: Smaller Models
|
||
and Faster Training’</a>)</td>
|
||
<td style="text-align: center;">55M</td>
|
||
<td style="text-align: center;">24G</td>
|
||
<td style="text-align: center;">14.9</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2021</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">EfficientNetV2-L (<a
|
||
href="https://arxiv.org/abs/2104.00298">‘EfficientNetV2: Smaller Models
|
||
and Faster Training’</a>)</td>
|
||
<td style="text-align: center;">121M</td>
|
||
<td style="text-align: center;">53G</td>
|
||
<td style="text-align: center;">14.3</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2021</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">EfficientNetV2-S (21k) (<a
|
||
href="https://arxiv.org/abs/2104.00298">‘EfficientNetV2: Smaller Models
|
||
and Faster Training’</a>)</td>
|
||
<td style="text-align: center;">24M</td>
|
||
<td style="text-align: center;">8.8G</td>
|
||
<td style="text-align: center;">15.0</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2021</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">EfficientNetV2-M (21k) (<a
|
||
href="https://arxiv.org/abs/2104.00298">‘EfficientNetV2: Smaller Models
|
||
and Faster Training’</a>)</td>
|
||
<td style="text-align: center;">55M</td>
|
||
<td style="text-align: center;">24G</td>
|
||
<td style="text-align: center;">13.9</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2021</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">EfficientNetV2-L (21k) (<a
|
||
href="https://arxiv.org/abs/2104.00298">‘EfficientNetV2: Smaller Models
|
||
and Faster Training’</a>)</td>
|
||
<td style="text-align: center;">121M</td>
|
||
<td style="text-align: center;">53G</td>
|
||
<td style="text-align: center;">13.2</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">2021</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
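<p>The error columns above are the usual ImageNet validation metrics: Top-k error is the percentage of images whose ground-truth label is not among the model’s k highest-scoring predictions. As a rough illustration (not part of the original list), the sketch below assumes PyTorch/torchvision and an ImageNet-style <code>val_loader</code> yielding <code>(images, labels)</code>; swap in any model from the table you have weights for.</p>
<pre><code>import torch
import torchvision

# A minimal sketch, assuming torchvision is installed and `val_loader` is an
# ImageNet-style DataLoader you provide. It reproduces the meaning of the
# "Number of parameters" and "Top-1 / Top-5 Error" columns, nothing more.
model = torchvision.models.resnet50(weights="IMAGENET1K_V1").eval()
n_params = sum(p.numel() for p in model.parameters())
print(f"parameters: {n_params / 1e6:.1f}M")

@torch.no_grad()
def topk_error(model, val_loader, ks=(1, 5)):
    correct = {k: 0 for k in ks}
    total = 0
    for images, labels in val_loader:
        logits = model(images)
        # indices of the k highest-scoring classes per image
        top = logits.topk(max(ks), dim=1).indices
        for k in ks:
            correct[k] += (top[:, :k] == labels.unsqueeze(1)).any(dim=1).sum().item()
        total += labels.size(0)
    return {k: 100.0 * (1 - correct[k] / total) for k in ks}  # percent error
</code></pre>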
<h2 id="segmentation-models">Segmentation models</h2>
<table>
<colgroup>
<col style="width: 68%" />
<col style="width: 1%" />
<col style="width: 5%" />
<col style="width: 5%" />
<col style="width: 6%" />
<col style="width: 3%" />
<col style="width: 6%" />
</colgroup>
<thead>
<tr class="header">
<th style="text-align: center;">Model</th>
<th style="text-align: center;">Year</th>
<th style="text-align: center;">PASCAL-Context (mIoU)</th>
<th style="text-align: center;">Cityscapes (mIoU)</th>
<th style="text-align: center;">PASCAL VOC 2012 (mIoU)</th>
<th style="text-align: center;">COCO Stuff (mIoU)</th>
<th style="text-align: center;">ADE20K val (mIoU)</th>
</tr>
</thead>
<tbody>
<tr class="odd">
|
||
<td style="text-align: center;">U-Net (<a
|
||
href="https://arxiv.org/pdf/1505.04597.pdf">‘U-Net: Convolutional
|
||
Networks for Biomedical Image Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2015</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">DeconvNet (<a
|
||
href="https://arxiv.org/pdf/1505.04366.pdf">‘Learning Deconvolution
|
||
Network for Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2015</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">72.5</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ParseNet (<a
|
||
href="https://arxiv.org/abs/1506.04579">‘ParseNet: Looking Wider to See
|
||
Better’</a>)</td>
|
||
<td style="text-align: center;">2015</td>
|
||
<td style="text-align: center;">40.4</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">69.8</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">Piecewise (<a
|
||
href="https://arxiv.org/abs/1504.01013">‘Efficient piecewise training of
|
||
deep structured models for semantic segmentation’</a>)</td>
|
||
<td style="text-align: center;">2015</td>
|
||
<td style="text-align: center;">43.3</td>
|
||
<td style="text-align: center;">71.6</td>
|
||
<td style="text-align: center;">78.0</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">SegNet (<a
|
||
href="https://arxiv.org/pdf/1511.00561.pdf">‘SegNet: A Deep
|
||
Convolutional Encoder-Decoder Architecture for Image
|
||
Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">56.1</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">FCN (<a
|
||
href="https://arxiv.org/pdf/1605.06211.pdf">‘Fully Convolutional
|
||
Networks for Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">37.8</td>
|
||
<td style="text-align: center;">65.3</td>
|
||
<td style="text-align: center;">62.2</td>
|
||
<td style="text-align: center;">22.7</td>
|
||
<td style="text-align: center;">29.39</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ENet (<a
|
||
href="https://arxiv.org/pdf/1606.02147.pdf">‘ENet: A Deep Neural Network
|
||
Architecture for Real-Time Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">58.3</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">DilatedNet (<a
|
||
href="https://arxiv.org/pdf/1511.07122.pdf">‘MULTI-SCALE CONTEXT
|
||
AGGREGATION BY DILATED CONVOLUTIONS’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">67.6</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">32.31</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">PixelNet (<a
|
||
href="https://arxiv.org/pdf/1609.06694.pdf">‘PixelNet: Towards a General
|
||
Pixel-Level Architecture’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">69.8</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">RefineNet (<a
|
||
href="https://arxiv.org/pdf/1611.06612.pdf">‘RefineNet: Multi-Path
|
||
Refinement Networks for High-Resolution Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">47.3</td>
|
||
<td style="text-align: center;">73.6</td>
|
||
<td style="text-align: center;">83.4</td>
|
||
<td style="text-align: center;">33.6</td>
|
||
<td style="text-align: center;">40.70</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">LRR (<a
|
||
href="https://arxiv.org/pdf/1605.02264.pdf">‘Laplacian Pyramid
|
||
Reconstruction and Refinement for Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">71.8</td>
|
||
<td style="text-align: center;">79.3</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">FRRN (<a
|
||
href="https://arxiv.org/pdf/1611.08323.pdf">‘Full-Resolution Residual
|
||
Networks for Semantic Segmentation in Street Scenes’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">71.8</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">MultiNet (<a
|
||
href="https://arxiv.org/pdf/1612.07695.pdf">‘MultiNet: Real-time Joint
|
||
Semantic Reasoning for Autonomous Driving’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">DeepLab (<a
|
||
href="https://arxiv.org/pdf/1606.00915.pdf">‘DeepLab: Semantic Image
|
||
Segmentation with Deep Convolutional Nets, Atrous Convolution, and Fully
|
||
Connected CRFs’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">45.7</td>
|
||
<td style="text-align: center;">64.8</td>
|
||
<td style="text-align: center;">79.7</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">LinkNet (<a
|
||
href="https://arxiv.org/pdf/1707.03718.pdf">‘LinkNet: Exploiting Encoder
|
||
Representations for Efficient Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">Tiramisu (<a
|
||
href="https://arxiv.org/pdf/1611.09326.pdf">‘The One Hundred Layers
|
||
Tiramisu: Fully Convolutional DenseNets for Semantic
|
||
Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ICNet (<a
|
||
href="https://arxiv.org/pdf/1704.08545.pdf">‘ICNet for Real-Time
|
||
Semantic Segmentation on High-Resolution Images’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">70.6</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">ERFNet (<a
|
||
href="http://www.robesafe.uah.es/personal/eduardo.romera/pdfs/Romera17iv.pdf">‘Efficient
|
||
ConvNet for Real-time Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">68.0</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">PSPNet (<a
|
||
href="https://arxiv.org/pdf/1612.01105.pdf">‘Pyramid Scene Parsing
|
||
Network’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">47.8</td>
|
||
<td style="text-align: center;">80.2</td>
|
||
<td style="text-align: center;">85.4</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">44.94</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">GCN (<a
|
||
href="https://arxiv.org/pdf/1703.02719.pdf">‘Large Kernel Matters —
|
||
Improve Semantic Segmentation by Global Convolutional Network’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">76.9</td>
|
||
<td style="text-align: center;">82.2</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">Segaware (<a
|
||
href="https://arxiv.org/pdf/1708.04607.pdf">‘Segmentation-Aware
|
||
Convolutional Networks Using Local Attention Masks’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">69.0</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">PixelDCN (<a
|
||
href="https://arxiv.org/pdf/1705.06820.pdf">‘PIXEL DECONVOLUTIONAL
|
||
NETWORKS’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">73.0</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">DeepLabv3 (<a
|
||
href="https://arxiv.org/pdf/1706.05587.pdf">‘Rethinking Atrous
|
||
Convolution for Semantic Image Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">85.7</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">DUC, HDC (<a
|
||
href="https://arxiv.org/pdf/1702.08502.pdf">‘Understanding Convolution
|
||
for Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">77.1</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ShuffleSeg (<a
|
||
href="https://arxiv.org/pdf/1803.03816.pdf">‘SHUFFLESEG: REAL-TIME
|
||
SEMANTIC SEGMENTATION NETWORK’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">59.3</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">AdaptSegNet (<a
|
||
href="https://arxiv.org/pdf/1802.10349.pdf">‘Learning to Adapt
|
||
Structured Output Space for Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">46.7</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">TuSimple-DUC (<a
|
||
href="https://arxiv.org/pdf/1702.08502.pdf">‘Understanding Convolution
|
||
for Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">80.1</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">83.1</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">R2U-Net (<a
|
||
href="https://arxiv.org/pdf/1802.06955.pdf">‘Recurrent Residual
|
||
Convolutional Neural Network based on U-Net (R2U-Net) for Medical Image
|
||
Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">Attention U-Net (<a
|
||
href="https://arxiv.org/pdf/1804.03999.pdf">‘Attention U-Net: Learning
|
||
Where to Look for the Pancreas’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">DANet (<a
|
||
href="https://arxiv.org/pdf/1809.02983.pdf">‘Dual Attention Network for
|
||
Scene Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">52.6</td>
|
||
<td style="text-align: center;">81.5</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">39.7</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ENCNet (<a
|
||
href="https://arxiv.org/abs/1803.08904">‘Context Encoding for Semantic
|
||
Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">51.7</td>
|
||
<td style="text-align: center;">75.8</td>
|
||
<td style="text-align: center;">85.9</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">44.65</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">ShelfNet (<a
|
||
href="https://arxiv.org/pdf/1811.11254.pdf">‘ShelfNet for Real-time
|
||
Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">48.4</td>
|
||
<td style="text-align: center;">75.8</td>
|
||
<td style="text-align: center;">84.2</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">LadderNet (<a
|
||
href="https://arxiv.org/pdf/1810.07810.pdf">‘LADDERNET: MULTI-PATH
|
||
NETWORKS BASED ON U-NET FOR MEDICAL IMAGE SEGMENTATION’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">CCC-ERFnet (<a
|
||
href="https://arxiv.org/pdf/1812.04920v1.pdf">‘Concentrated-Comprehensive
|
||
Convolutions for lightweight semantic segmentation’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">69.01</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">DifNet-101 (<a
|
||
href="http://papers.nips.cc/paper/7435-difnet-semantic-segmentation-by-diffusion-networks.pdf">‘DifNet:
|
||
Semantic Segmentation by Diffusion Networks’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">45.1</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">73.2</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">BiSeNet(Res18) (<a
|
||
href="https://arxiv.org/pdf/1808.00897.pdf">‘BiSeNet: Bilateral
|
||
Segmentation Network for Real-time Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">74.7</td>
|
||
<td style="text-align: center;">28.1</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">ESPNet (<a
|
||
href="https://arxiv.org/pdf/1803.06815.pdf">‘ESPNet: Efficient Spatial
|
||
Pyramid of Dilated Convolutions for Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">63.01</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">SPADE (<a
|
||
href="https://arxiv.org/pdf/1903.07291.pdf">‘Semantic Image Synthesis
|
||
with Spatially-Adaptive Normalization’</a>)</td>
|
||
<td style="text-align: center;">2019</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">62.3</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">37.4</td>
|
||
<td style="text-align: center;">38.5</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">SeamlessSeg (<a
|
||
href="https://arxiv.org/pdf/1905.01220v1.pdf">‘Seamless Scene
|
||
Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2019</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">77.5</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">EMANet (<a
|
||
href="https://arxiv.org/pdf/1907.13426.pdf">‘Expectation-Maximization
|
||
Attention Networks for Semantic Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2019</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">88.2</td>
|
||
<td style="text-align: center;">39.9</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<h2 id="detection-models">Detection models</h2>
|
||
<table>
|
||
<colgroup>
|
||
<col style="width: 79%" />
|
||
<col style="width: 2%" />
|
||
<col style="width: 6%" />
|
||
<col style="width: 6%" />
|
||
<col style="width: 3%" />
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th style="text-align: center;">Model</th>
|
||
<th style="text-align: center;">Year</th>
|
||
<th style="text-align: center;">VOC07 (mAP@IoU=0.5)</th>
|
||
<th style="text-align: center;">VOC12 (mAP@IoU=0.5)</th>
|
||
<th style="text-align: center;">COCO (mAP)</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">R-CNN (<a
|
||
href="https://arxiv.org/pdf/1311.2524.pdf">‘Rich feature hierarchies for
|
||
accurate object detection and semantic segmentation’</a>)</td>
|
||
<td style="text-align: center;">2014</td>
|
||
<td style="text-align: center;">58.5</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">OverFeat (<a
|
||
href="https://arxiv.org/pdf/1312.6229.pdf">‘OverFeat: Integrated
|
||
Recognition, Localization and Detection using Convolutional
|
||
Networks’</a>)</td>
|
||
<td style="text-align: center;">2014</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">MultiBox (<a
|
||
href="https://www.cv-foundation.org/openaccess/content_cvpr_2014/papers/Erhan_Scalable_Object_Detection_2014_CVPR_paper.pdf">‘Scalable
|
||
Object Detection using Deep Neural Networks’</a>)</td>
|
||
<td style="text-align: center;">2014</td>
|
||
<td style="text-align: center;">29.0</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">SPP-Net (<a
|
||
href="https://arxiv.org/pdf/1406.4729.pdf">‘Spatial Pyramid Pooling in
|
||
Deep Convolutional Networks for Visual Recognition’</a>)</td>
|
||
<td style="text-align: center;">2014</td>
|
||
<td style="text-align: center;">59.2</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">MR-CNN (<a
|
||
href="https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Gidaris_Object_Detection_via_ICCV_2015_paper.pdf">‘Object
|
||
detection via a multi-region & semantic segmentation-aware CNN
|
||
model’</a>)</td>
|
||
<td style="text-align: center;">2015</td>
|
||
<td style="text-align: center;">78.2</td>
|
||
<td style="text-align: center;">73.9</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">AttentionNet (<a
|
||
href="https://arxiv.org/pdf/1506.07704.pdf">‘AttentionNet: Aggregating
|
||
Weak Directions for Accurate Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2015</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">Fast R-CNN (<a
|
||
href="https://arxiv.org/pdf/1504.08083.pdf">‘Fast R-CNN’</a>)</td>
|
||
<td style="text-align: center;">2015</td>
|
||
<td style="text-align: center;">70.0</td>
|
||
<td style="text-align: center;">68.4</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">Fast R-CNN (<a
|
||
href="https://papers.nips.cc/paper/5638-faster-r-cnn-towards-real-time-object-detection-with-region-proposal-networks.pdf">‘Faster
|
||
R-CNN: Towards Real-Time Object Detection with Region Proposal
|
||
Networks’</a>)</td>
|
||
<td style="text-align: center;">2015</td>
|
||
<td style="text-align: center;">73.2</td>
|
||
<td style="text-align: center;">70.4</td>
|
||
<td style="text-align: center;">36.8</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">YOLO v1 (<a
|
||
href="https://arxiv.org/pdf/1506.02640.pdf">‘You Only Look Once:
|
||
Unified, Real-Time Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">66.4</td>
|
||
<td style="text-align: center;">57.9</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">G-CNN (<a
|
||
href="https://arxiv.org/pdf/1512.07729.pdf">‘G-CNN: an Iterative Grid
|
||
Based Object Detector’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">66.8</td>
|
||
<td style="text-align: center;">66.4</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">AZNet (<a
|
||
href="https://arxiv.org/pdf/1512.07711.pdf">‘Adaptive Object Detection
|
||
Using Adjacency and Zoom Prediction’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">70.4</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">22.3</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">ION (<a
|
||
href="https://arxiv.org/pdf/1512.04143.pdf">‘Inside-Outside Net:
|
||
Detecting Objects in Context with Skip Pooling and Recurrent Neural
|
||
Networks’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">80.1</td>
|
||
<td style="text-align: center;">77.9</td>
|
||
<td style="text-align: center;">33.1</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">HyperNet (<a
|
||
href="https://arxiv.org/pdf/1604.00600.pdf">‘HyperNet: Towards Accurate
|
||
Region Proposal Generation and Joint Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">76.3</td>
|
||
<td style="text-align: center;">71.4</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">OHEM (<a
|
||
href="https://arxiv.org/pdf/1604.03540.pdf">‘Training Region-based
|
||
Object Detectors with Online Hard Example Mining’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">78.9</td>
|
||
<td style="text-align: center;">76.3</td>
|
||
<td style="text-align: center;">22.4</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">MPN (<a
|
||
href="https://arxiv.org/pdf/1604.02135.pdf">‘A MultiPath Network for
|
||
Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">33.2</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">SSD (<a
|
||
href="https://arxiv.org/pdf/1512.02325.pdf">‘SSD: Single Shot MultiBox
|
||
Detector’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">76.8</td>
|
||
<td style="text-align: center;">74.9</td>
|
||
<td style="text-align: center;">31.2</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">GBDNet (<a
|
||
href="https://arxiv.org/pdf/1610.02579.pdf">‘Crafting GBD-Net for Object
|
||
Detection’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">77.2</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">27.0</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">CPF (<a
|
||
href="https://pdfs.semanticscholar.org/40e7/4473cb82231559cbaeaa44989e9bbfe7ec3f.pdf">‘Contextual
|
||
Priming and Feedback for Faster R-CNN’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">76.4</td>
|
||
<td style="text-align: center;">72.6</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">MS-CNN (<a
|
||
href="https://arxiv.org/pdf/1607.07155.pdf">‘A Unified Multi-scale Deep
|
||
Convolutional Neural Network for Fast Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">R-FCN (<a
|
||
href="https://arxiv.org/pdf/1605.06409.pdf">‘R-FCN: Object Detection via
|
||
Region-based Fully Convolutional Networks’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">79.5</td>
|
||
<td style="text-align: center;">77.6</td>
|
||
<td style="text-align: center;">29.9</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">PVANET (<a
|
||
href="https://arxiv.org/pdf/1608.08021.pdf">‘PVANET: Deep but
|
||
Lightweight Neural Networks for Real-time Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">DeepID-Net (<a
|
||
href="https://arxiv.org/pdf/1412.5661.pdf">‘DeepID-Net: Deformable Deep
|
||
Convolutional Neural Networks for Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">69.0</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">NoC (<a
|
||
href="https://arxiv.org/pdf/1504.06066.pdf">‘Object Detection Networks
|
||
on Convolutional Feature Maps’</a>)</td>
|
||
<td style="text-align: center;">2016</td>
|
||
<td style="text-align: center;">71.6</td>
|
||
<td style="text-align: center;">68.8</td>
|
||
<td style="text-align: center;">27.2</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">DSSD (<a
|
||
href="https://arxiv.org/pdf/1701.06659.pdf">‘DSSD : Deconvolutional
|
||
Single Shot Detector’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">81.5</td>
|
||
<td style="text-align: center;">80.0</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">TDM (<a
|
||
href="https://arxiv.org/pdf/1612.06851.pdf">‘Beyond Skip Connections:
|
||
Top-Down Modulation for Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">37.3</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">FPN (<a
|
||
href="http://openaccess.thecvf.com/content_cvpr_2017/papers/Lin_Feature_Pyramid_Networks_CVPR_2017_paper.pdf">‘Feature
|
||
Pyramid Networks for Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">36.2</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">YOLO v2 (<a
|
||
href="https://arxiv.org/pdf/1612.08242.pdf">‘YOLO9000: Better, Faster,
|
||
Stronger’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">78.6</td>
|
||
<td style="text-align: center;">73.4</td>
|
||
<td style="text-align: center;">21.6</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">RON (<a
|
||
href="https://arxiv.org/pdf/1707.01691.pdf">‘RON: Reverse Connection
|
||
with Objectness Prior Networks for Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">77.6</td>
|
||
<td style="text-align: center;">75.4</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">DCN (<a
|
||
href="http://openaccess.thecvf.com/content_ICCV_2017/papers/Dai_Deformable_Convolutional_Networks_ICCV_2017_paper.pdf">‘Deformable
|
||
Convolutional Networks’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">DeNet (<a
|
||
href="https://arxiv.org/pdf/1703.10295.pdf">‘DeNet: Scalable Real-time
|
||
Object Detection with Directed Sparse Sampling’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">77.1</td>
|
||
<td style="text-align: center;">73.9</td>
|
||
<td style="text-align: center;">33.8</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">CoupleNet (<a
|
||
href="https://arxiv.org/pdf/1708.02863.pdf">‘CoupleNet: Coupling Global
|
||
Structure with Local Parts for Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">82.7</td>
|
||
<td style="text-align: center;">80.4</td>
|
||
<td style="text-align: center;">34.4</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">RetinaNet (<a
|
||
href="https://arxiv.org/pdf/1708.02002.pdf">‘Focal Loss for Dense Object
|
||
Detection’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">39.1</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">Mask R-CNN (<a
|
||
href="http://openaccess.thecvf.com/content_ICCV_2017/papers/He_Mask_R-CNN_ICCV_2017_paper.pdf">‘Mask
|
||
R-CNN’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">39.8</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">DSOD (<a
|
||
href="https://arxiv.org/pdf/1708.01241.pdf">‘DSOD: Learning Deeply
|
||
Supervised Object Detectors from Scratch’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">77.7</td>
|
||
<td style="text-align: center;">76.3</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">SMN (<a
|
||
href="http://openaccess.thecvf.com/content_ICCV_2017/papers/Chen_Spatial_Memory_for_ICCV_2017_paper.pdf">‘Spatial
|
||
Memory for Context Reasoning in Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2017</td>
|
||
<td style="text-align: center;">70.0</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">YOLO v3 (<a
|
||
href="https://pjreddie.com/media/files/papers/YOLOv3.pdf">‘YOLOv3: An
|
||
Incremental Improvement’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">33.0</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">SIN (<a
|
||
href="http://openaccess.thecvf.com/content_cvpr_2018/papers/Liu_Structure_Inference_Net_CVPR_2018_paper.pdf">‘Structure
|
||
Inference Net: Object Detection Using Scene-Level Context and
|
||
Instance-Level Relationships’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">76.0</td>
|
||
<td style="text-align: center;">73.1</td>
|
||
<td style="text-align: center;">23.2</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">STDN (<a
|
||
href="http://openaccess.thecvf.com/content_cvpr_2018/papers/Zhou_Scale-Transferrable_Object_Detection_CVPR_2018_paper.pdf">‘Scale-Transferrable
|
||
Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">80.9</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">RefineDet (<a
|
||
href="http://openaccess.thecvf.com/content_cvpr_2018/papers/Zhang_Single-Shot_Refinement_Neural_CVPR_2018_paper.pdf">‘Single-Shot
|
||
Refinement Neural Network for Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">83.8</td>
|
||
<td style="text-align: center;">83.5</td>
|
||
<td style="text-align: center;">41.8</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">MegDet (<a
|
||
href="http://openaccess.thecvf.com/content_cvpr_2018/papers/Peng_MegDet_A_Large_CVPR_2018_paper.pdf">‘MegDet:
|
||
A Large Mini-Batch Object Detector’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">RFBNet (<a
|
||
href="https://arxiv.org/pdf/1711.07767.pdf">‘Receptive Field Block Net
|
||
for Accurate and Fast Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">82.2</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">CornerNet (<a
|
||
href="https://arxiv.org/pdf/1808.01244.pdf">‘CornerNet: Detecting
|
||
Objects as Paired Keypoints’</a>)</td>
|
||
<td style="text-align: center;">2018</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">42.1</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">LibraRetinaNet (<a
|
||
href="https://arxiv.org/pdf/1904.02701v1.pdf">‘Libra R-CNN: Towards
|
||
Balanced Learning for Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2019</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">43.0</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">YOLACT-700 (<a
|
||
href="https://arxiv.org/pdf/1904.02689v1.pdf">‘YOLACT Real-time Instance
|
||
Segmentation’</a>)</td>
|
||
<td style="text-align: center;">2019</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">31.2</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">DetNASNet(3.8) (<a
|
||
href="https://arxiv.org/pdf/1903.10979v2.pdf">‘DetNAS: Backbone Search
|
||
for Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2019</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">42.0</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">YOLOv4 (<a
|
||
href="https://arxiv.org/pdf/2004.10934.pdf">‘YOLOv4: Optimal Speed and
|
||
Accuracy of Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2020</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">46.7</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">SOLO (<a
|
||
href="https://arxiv.org/pdf/1912.04488v3.pdf">‘SOLO: Segmenting Objects
|
||
by Locations’</a>)</td>
|
||
<td style="text-align: center;">2020</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">37.8</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">D-SOLO (<a
|
||
href="https://arxiv.org/pdf/1912.04488v3.pdf">‘SOLO: Segmenting Objects
|
||
by Locations’</a>)</td>
|
||
<td style="text-align: center;">2020</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">40.5</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td style="text-align: center;">SNIPER (<a
|
||
href="https://arxiv.org/pdf/2102.05646v1.pdf">‘Scale Normalized Image
|
||
Pyramids with AutoFocus for Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2021</td>
|
||
<td style="text-align: center;">86.6</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">47.9</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td style="text-align: center;">AutoFocus (<a
|
||
href="https://arxiv.org/pdf/2102.05646v1.pdf">‘Scale Normalized Image
|
||
Pyramids with AutoFocus for Object Detection’</a>)</td>
|
||
<td style="text-align: center;">2021</td>
|
||
<td style="text-align: center;">85.8</td>
|
||
<td style="text-align: center;">?</td>
|
||
<td style="text-align: center;">47.9</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
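<p>For reference, the VOC07 and VOC12 columns above report mean average precision where a detection counts as correct when its predicted class matches and its box overlaps a ground-truth box with IoU of at least 0.5; the COCO column typically follows the COCO convention of averaging AP over IoU thresholds from 0.5 to 0.95. The sketch below illustrates only the IoU matching criterion behind those columns; the helper name <code>box_iou</code> and the example coordinates are made up for this illustration and are not taken from any of the listed papers.</p>
<pre><code># Minimal sketch of the IoU test behind the mAP@IoU=0.5 columns (illustrative only).
def box_iou(box_a, box_b):
    """IoU of two axis-aligned boxes given as (x1, y1, x2, y2) corners."""
    ix1 = max(box_a[0], box_b[0])
    iy1 = max(box_a[1], box_b[1])
    ix2 = min(box_a[2], box_b[2])
    iy2 = min(box_a[3], box_b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter)

# Hypothetical predicted and annotated boxes, chosen only for illustration.
prediction = (48, 40, 210, 180)
ground_truth = (50, 45, 200, 175)
print(box_iou(prediction, ground_truth) >= 0.5)  # True: this pair would count as a match at IoU=0.5</code></pre>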
<p><a href="https://github.com/nerox8664/awesome-computer-vision-models">computervisionmodels.md on GitHub</a></p>