This commit is contained in: primary
parent 163eb2fa72
commit 3cda3f0e62

main.bbl (new file, 310 lines)
@@ -0,0 +1,310 @@
% Generated by IEEEtran.bst, version: 1.14 (2015/08/26)
\begin{thebibliography}{10}
\providecommand{\url}[1]{#1}
\csname url@samestyle\endcsname
\providecommand{\newblock}{\relax}
\providecommand{\bibinfo}[2]{#2}
\providecommand{\BIBentrySTDinterwordspacing}{\spaceskip=0pt\relax}
\providecommand{\BIBentryALTinterwordstretchfactor}{4}
\providecommand{\BIBentryALTinterwordspacing}{\spaceskip=\fontdimen2\font plus
\BIBentryALTinterwordstretchfactor\fontdimen3\font minus
\fontdimen4\font\relax}
\providecommand{\BIBforeignlanguage}[2]{{%
\expandafter\ifx\csname l@#1\endcsname\relax
\typeout{** WARNING: IEEEtran.bst: No hyphenation pattern has been}%
\typeout{** loaded for the language `#1'. Using the pattern for}%
\typeout{** the default language instead.}%
\else
\language=\csname l@#1\endcsname
\fi
#2}}
\providecommand{\BIBdecl}{\relax}
\BIBdecl

\bibitem{adas}
A.~Bar~Hillel, R.~Lerner, D.~Levi, and G.~Raz, ``Recent progress in road and
lane detection: a survey,'' \emph{Machine Vision and Applications}, vol.~25,
no.~3, pp. 727--745, 2014.

\bibitem{scnn}
X.~Pan, J.~Shi, P.~Luo, X.~Wang, and X.~Tang, ``Spatial as deep: Spatial CNN
for traffic scene understanding,'' in \emph{Proceedings of the AAAI
conference on artificial intelligence}, vol.~32, no.~1, 2018.

\bibitem{polylanenet}
L.~Tabelini, R.~Berriel, T.~M. Paixao, C.~Badue, A.~F. De~Souza, and
T.~Oliveira-Santos, ``Polylanenet: Lane estimation via deep polynomial
regression,'' in \emph{2020 25th International Conference on Pattern
Recognition (ICPR)}.\hskip 1em plus 0.5em minus 0.4em\relax IEEE, 2021, pp.
6150--6156.

\bibitem{cannyedge}
J.~Canny, ``A computational approach to edge detection,'' \emph{IEEE
Transactions on pattern analysis and machine intelligence}, no.~6, pp.
679--698, 1986.

\bibitem{houghtransform}
J.~Illingworth and J.~Kittler, ``A survey of the Hough transform,''
\emph{Computer vision, graphics, and image processing}, vol.~44, no.~1, pp.
87--116, 1988.

\bibitem{kluge1995deformable}
K.~Kluge and S.~Lakshmanan, ``A deformable-template approach to lane
detection,'' in \emph{Proceedings of the Intelligent Vehicles' 95.
Symposium}.\hskip 1em plus 0.5em minus 0.4em\relax IEEE, 1995, pp. 54--59.

\bibitem{lstr}
R.~Liu, Z.~Yuan, T.~Liu, and Z.~Xiong, ``End-to-end lane shape prediction with
transformers,'' in \emph{Proceedings of the IEEE/CVF winter conference on
applications of computer vision}, 2021, pp. 3694--3702.

\bibitem{lanenet}
Z.~Wang, W.~Ren, and Q.~Qiu, ``Lanenet: Real-time lane detection networks for
autonomous driving,'' \emph{arXiv preprint arXiv:1807.01726}, 2018.

\bibitem{bezierlanenet}
Z.~Feng, S.~Guo, X.~Tan, K.~Xu, M.~Wang, and L.~Ma, ``Rethinking efficient lane
detection via curve modeling,'' in \emph{Proceedings of the IEEE/CVF
Conference on Computer Vision and Pattern Recognition}, 2022, pp.
17\,062--17\,070.

\bibitem{yolov10}
A.~Wang, H.~Chen, L.~Liu, K.~Chen, Z.~Lin, J.~Han, and G.~Ding, ``Yolov10:
Real-time end-to-end object detection,'' \emph{arXiv preprint
arXiv:2405.14458}, 2024.

\bibitem{fasterrcnn}
S.~Ren, K.~He, R.~Girshick, and J.~Sun, ``Faster r-cnn: Towards real-time
object detection with region proposal networks,'' \emph{IEEE transactions on
pattern analysis and machine intelligence}, vol.~39, no.~6, pp. 1137--1149,
2016.

\bibitem{laneatt}
L.~Tabelini, R.~Berriel, T.~M. Paixao, C.~Badue, A.~F. De~Souza, and
T.~Oliveira-Santos, ``Keep your eyes on the lane: Real-time attention-guided
lane detection,'' in \emph{Proceedings of the IEEE/CVF conference on computer
vision and pattern recognition}, 2021, pp. 294--302.

\bibitem{clrnet}
T.~Zheng, Y.~Huang, Y.~Liu, W.~Tang, Z.~Yang, D.~Cai, and X.~He, ``Clrnet:
Cross layer refinement network for lane detection,'' in \emph{Proceedings of
the IEEE/CVF conference on computer vision and pattern recognition}, 2022,
pp. 898--907.

\bibitem{nms}
A.~Neubeck and L.~Van~Gool, ``Efficient non-maximum suppression,'' in
\emph{18th international conference on pattern recognition (ICPR'06)},
vol.~3.\hskip 1em plus 0.5em minus 0.4em\relax IEEE, 2006, pp. 850--855.

\bibitem{adnet}
L.~Xiao, X.~Li, S.~Yang, and W.~Yang, ``Adnet: Lane shape prediction via anchor
decomposition,'' in \emph{Proceedings of the IEEE/CVF International
Conference on Computer Vision}, 2023, pp. 6404--6413.

\bibitem{srlane}
C.~Chen, J.~Liu, C.~Zhou, J.~Tang, and G.~Wu, ``Sketch and refine: Towards fast
and accurate lane detection,'' in \emph{Proceedings of the AAAI Conference on
Artificial Intelligence}, vol.~38, no.~2, 2024, pp. 1001--1009.

\bibitem{clrernet}
H.~Honda and Y.~Uchida, ``Clrernet: Improving confidence of lane detection with
laneiou,'' in \emph{Proceedings of the IEEE/CVF Winter Conference on
Applications of Computer Vision}, 2024, pp. 1176--1185.

\bibitem{gnn}
Z.~Wu, S.~Pan, F.~Chen, G.~Long, C.~Zhang, and S.~Y. Philip, ``A comprehensive
survey on graph neural networks,'' \emph{IEEE transactions on neural networks
and learning systems}, vol.~32, no.~1, pp. 4--24, 2020.

\bibitem{tusimple}
\BIBentryALTinterwordspacing
{TuSimple}, ``Tusimple benchmark,'' 2020, accessed: September 2020. [Online].
Available: \url{https://github.com/TuSimple/tusimple-benchmark/}
\BIBentrySTDinterwordspacing

\bibitem{llamas}
K.~Behrendt and R.~Soussan, ``Unsupervised labeled lane markers using maps,''
in \emph{Proceedings of the IEEE/CVF international conference on computer
vision workshops}, 2019, pp. 0--0.

\bibitem{curvelanes}
H.~Xu, S.~Wang, X.~Cai, W.~Zhang, X.~Liang, and Z.~Li, ``Curvelane-nas:
Unifying lane-sensitive architecture search and adaptive point blending,'' in
\emph{Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK,
August 23--28, 2020, Proceedings, Part XV 16}.\hskip 1em plus 0.5em minus
0.4em\relax Springer, 2020, pp. 689--704.

\bibitem{dalnet}
Z.~Yu, Q.~Liu, W.~Wang, L.~Zhang, and X.~Zhao, ``Dalnet: A rail detection
network based on dynamic anchor line,'' \emph{IEEE Transactions on
Instrumentation and Measurement}, 2024.

\bibitem{ufld}
Z.~Qin, H.~Wang, and X.~Li, ``Ultra fast structure-aware deep lane detection,''
in \emph{Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK,
August 23--28, 2020, Proceedings, Part XXIV 16}.\hskip 1em plus 0.5em minus
0.4em\relax Springer, 2020, pp. 276--291.

\bibitem{ufldv2}
Z.~Qin, P.~Zhang, and X.~Li, ``Ultra fast deep lane detection with hybrid
anchor driven ordinal classification,'' \emph{IEEE transactions on pattern
analysis and machine intelligence}, vol.~46, no.~5, pp. 2555--2568, 2022.

\bibitem{CondLaneNet}
L.~Liu, X.~Chen, S.~Zhu, and P.~Tan, ``Condlanenet: a top-to-down lane
detection framework based on conditional convolution,'' in \emph{Proceedings
of the IEEE/CVF international conference on computer vision}, 2021, pp.
3773--3782.

\bibitem{fololane}
Z.~Qu, H.~Jin, Y.~Zhou, Z.~Yang, and W.~Zhang, ``Focus on local: Detecting
lane marker from bottom up via key point,'' in \emph{Proceedings of the
IEEE/CVF conference on computer vision and pattern recognition}, 2021, pp.
14\,122--14\,131.

\bibitem{ganet}
J.~Wang, Y.~Ma, S.~Huang, T.~Hui, F.~Wang, C.~Qian, and T.~Xiang, ``A
keypoint-based global association network for lane detection,'' in
\emph{Proceedings of the IEEE/CVF conference on computer vision and pattern
recognition}, 2022, pp. 1392--1401.

\bibitem{bsnet}
H.~Chen, M.~Wang, and Y.~Liu, ``Bsnet: Lane detection via draw b-spline curves
nearby,'' \emph{arXiv preprint arXiv:2301.06910}, 2023.

\bibitem{yolox}
Z.~Ge, S.~Liu, F.~Wang, Z.~Li, and J.~Sun, ``Yolox:
Exceeding yolo series in 2021,'' \emph{arXiv preprint arXiv:2107.08430},
2021.

\bibitem{sparse}
J.~Liu, Z.~Zhang, M.~Lu, H.~Wei, D.~Li, Y.~Xie, J.~Peng, L.~Tian, A.~Sirasao,
and E.~Barsoum, ``Sparse laneformer,'' \emph{arXiv preprint
arXiv:2404.07821}, 2024.

\bibitem{detr}
N.~Carion, F.~Massa, G.~Synnaeve, N.~Usunier, A.~Kirillov, and S.~Zagoruyko,
``End-to-end object detection with transformers,'' in \emph{European
conference on computer vision}.\hskip 1em plus 0.5em minus 0.4em\relax
Springer, 2020, pp. 213--229.

\bibitem{o2o}
P.~Sun, Y.~Jiang, E.~Xie, W.~Shao, Z.~Yuan, C.~Wang, and P.~Luo, ``What makes
for end-to-end object detection?'' in \emph{International Conference on
Machine Learning}.\hskip 1em plus 0.5em minus 0.4em\relax PMLR, 2021, pp.
9934--9944.

\bibitem{learnNMS}
J.~Hosang, R.~Benenson, and B.~Schiele, ``Learning non-maximum suppression,''
in \emph{Proceedings of the IEEE conference on computer vision and pattern
recognition}, 2017, pp. 4507--4515.

\bibitem{date}
Y.~Chen, Q.~Chen, Q.~Hu, and J.~Cheng, ``Date: Dual assignment for end-to-end
fully convolutional object detection,'' \emph{arXiv preprint
arXiv:2211.13859}, 2022.

\bibitem{o3d}
J.~Wang, L.~Song, Z.~Li, H.~Sun, J.~Sun, and N.~Zheng, ``End-to-end object
detection with fully convolutional network,'' in \emph{Proceedings of the
IEEE/CVF conference on computer vision and pattern recognition}, 2021, pp.
15\,849--15\,858.

\bibitem{relationnet}
H.~Hu, J.~Gu, Z.~Zhang, J.~Dai, and Y.~Wei, ``Relation networks for object
detection,'' in \emph{Proceedings of the IEEE conference on computer vision
and pattern recognition}, 2018, pp. 3588--3597.

\bibitem{resnet}
K.~He, X.~Zhang, S.~Ren, and J.~Sun, ``Deep residual learning for image
recognition,'' in \emph{Proceedings of the IEEE conference on computer vision
and pattern recognition}, 2016, pp. 770--778.

\bibitem{fpn}
T.-Y. Lin, P.~Doll{\'a}r, R.~Girshick, K.~He, B.~Hariharan, and S.~Belongie,
``Feature pyramid networks for object detection,'' in \emph{Proceedings of
the IEEE conference on computer vision and pattern recognition}, 2017, pp.
2117--2125.

\bibitem{linecnn}
X.~Li, J.~Li, X.~Hu, and J.~Yang, ``Line-cnn: End-to-end traffic line detection
with line proposal unit,'' \emph{IEEE Transactions on Intelligent
Transportation Systems}, vol.~21, no.~1, pp. 248--258, 2019.

\bibitem{vanishing}
Y.~Su, Y.~Zhang, T.~Lu, J.~Yang, and H.~Kong, ``Vanishing point constrained
lane detection with a stereo camera,'' \emph{IEEE Transactions on Intelligent
Transportation Systems}, vol.~19, no.~8, pp. 2739--2744, 2017.

\bibitem{dualassign}
S.~Li, C.~He, R.~Li, and L.~Zhang, ``A dual weighting label assignment scheme
for object detection,'' in \emph{Proceedings of the IEEE/CVF conference on
computer vision and pattern recognition}, 2022, pp. 9387--9396.

\bibitem{yolact}
D.~Bolya, C.~Zhou, F.~Xiao, and Y.~J. Lee, ``Yolact: Real-time instance
segmentation,'' in \emph{Proceedings of the IEEE/CVF international conference
on computer vision}, 2019, pp. 9157--9166.

\bibitem{pointnet}
C.~R. Qi, H.~Su, K.~Mo, and L.~J. Guibas, ``Pointnet: Deep learning on point
sets for 3d classification and segmentation,'' in \emph{Proceedings of the
IEEE conference on computer vision and pattern recognition}, 2017, pp.
652--660.

\bibitem{focal}
T.-Y. Lin, P.~Goyal, R.~Girshick, K.~He, and P.~Doll{\'a}r, ``Focal loss for
dense object detection,'' in \emph{Proceedings of the IEEE international
conference on computer vision}, 2017, pp. 2980--2988.

\bibitem{pss}
Q.~Zhou and C.~Yu, ``Object detection made simpler by eliminating heuristic
nms,'' \emph{IEEE Transactions on Multimedia}, vol.~25, pp. 9254--9262, 2023.

\bibitem{adam}
D.~P. Kingma and J.~Ba, ``Adam: A method for stochastic optimization,''
\emph{arXiv preprint arXiv:1412.6980}, 2014.

\bibitem{dla}
F.~Yu, D.~Wang, E.~Shelhamer, and T.~Darrell, ``Deep layer aggregation,'' in
\emph{Proceedings of the IEEE conference on computer vision and pattern
recognition}, 2018, pp. 2403--2412.

\bibitem{resa}
T.~Zheng, H.~Fang, Y.~Zhang, W.~Tang, Z.~Yang, H.~Liu, and D.~Cai, ``Resa:
Recurrent feature-shift aggregator for lane detection,'' in \emph{Proceedings
of the AAAI conference on artificial intelligence}, vol.~35, no.~4, 2021, pp.
3547--3554.

\bibitem{laneaf}
H.~Abualsaud, S.~Liu, D.~B. Lu, K.~Situ, A.~Rangesh, and M.~M. Trivedi,
``Laneaf: Robust multi-lane detection with affinity fields,'' \emph{IEEE
Robotics and Automation Letters}, vol.~6, no.~4, pp. 7477--7484, 2021.

\bibitem{eigenlanes}
D.~Jin, W.~Park, S.-G. Jeong, H.~Kwon, and C.-S. Kim, ``Eigenlanes: Data-driven
lane descriptors for structurally diverse lanes,'' in \emph{Proceedings of
the IEEE/CVF conference on computer vision and pattern recognition}, 2022,
pp. 17\,163--17\,171.

\bibitem{enetsad}
Y.~Hou, Z.~Ma, C.~Liu, and C.~C. Loy, ``Learning lightweight lane detection
cnns by self attention distillation,'' in \emph{Proceedings of the IEEE/CVF
international conference on computer vision}, 2019, pp. 1013--1021.

\bibitem{pointlanenet}
Z.~Chen, Q.~Liu, and C.~Lian, ``Pointlanenet: Efficient end-to-end cnns for
accurate real-time lane detection,'' in \emph{2019 IEEE intelligent vehicles
symposium (IV)}.\hskip 1em plus 0.5em minus 0.4em\relax IEEE, 2019, pp.
2563--2568.

\bibitem{iouloss}
J.~Yu, Y.~Jiang, Z.~Wang, Z.~Cao, and T.~Huang, ``Unitbox: An advanced object
detection network,'' in \emph{Proceedings of the 24th ACM international
conference on Multimedia}, 2016, pp. 516--520.

\bibitem{giouloss}
H.~Rezatofighi, N.~Tsoi, J.~Gwak, A.~Sadeghian, I.~Reid, and S.~Savarese,
``Generalized intersection over union: A metric and a loss for bounding box
regression,'' in \emph{Proceedings of the IEEE/CVF conference on computer
vision and pattern recognition}, 2019, pp. 658--666.

\end{thebibliography}
main.tex (66 changed lines)
@@ -265,7 +265,7 @@ where $\boldsymbol{c}^{g} \in \mathbb{R}^{2}$ and $\boldsymbol{c}^{l}_{j} \in \m
 x_{i,j}^{s}&=-y_{i,j}^{s}\tan \theta _j+\frac{r_{j}^{g}+\left[ \cos \theta _j;\sin \theta _j \right] ^T\boldsymbol{c}^g}{\cos \theta _j},\label{positions}\\
 i&=1,2,\cdots,N;j=1,2,\cdots,K,\notag
 \end{align}
-where the y-coordinates $\boldsymbol{y}_{j}\equiv \{y_{1,j},y_{2,j},\cdots ,y_{N,j}\}$ of the $j$-th lane anchor are uniformly sampled vertically from the image, as previously mentioned. The proof of Eqs. (\ref{l2g})-(\ref{positions}) can be found in Appendix \ref{proof_l2g}. Then the coordinates of the $j$-th lane anchor can be given by $\boldsymbol{\ell}_j=\{\boldsymbol{x}_{j},\boldsymbol{y}_j\}\equiv \left\{(x_{1,j},y_{1,j}),(x_{2,j},y_{2,j}),\cdots ,(x_{N,j}, y_{N,j})\right\}$.
+where the y-coordinates $\boldsymbol{y}_{j}\equiv \{y_{1,j},y_{2,j},\cdots ,y_{N,j}\}$ of the $j$-th lane anchor are uniformly sampled vertically from the image, as previously mentioned. The proof of Eqs. (\ref{l2g})-(\ref{positions}) can be found in Appendix \textcolor{red}{A}. Then the coordinates of the $j$-th lane anchor can be given by $\boldsymbol{\ell}_j=\{\boldsymbol{x}_{j},\boldsymbol{y}_j\}\equiv \left\{(x_{1,j},y_{1,j}),(x_{2,j},y_{2,j}),\cdots ,(x_{N,j}, y_{N,j})\right\}$.
 \par
 Given the different level feature maps $\boldsymbol{P}_1, \boldsymbol{P}_2, \boldsymbol{P}_3$ from FPN, we can extract the channel-wise features of each point corresponding to the positions of $\{(x_{1,j},y_{1,j}),(x_{2,j},y_{2,j}),\cdots,(x_{N,j},y_{N,j})\}_{j=1}^{K}$, respectively denoted as $\boldsymbol{F}_{1,j}, \boldsymbol{F}_{2,j}, \boldsymbol{F}_{3,j}\in \mathbb{R} ^{N\times C_f}$. To enhance representation, similar to \cite{srlane}, we employ a weighted sum strategy to combine features from the three levels by:
 \begin{align}
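To make the sampling equation in the hunk above concrete, here is a minimal NumPy sketch of Eq. (\ref{positions}), which recovers the x-coordinates of the $N$ points sampled on a lane anchor from its global polar parameters. The function name and the example values are illustrative assumptions, not values from the paper.

```python
import numpy as np

def sample_anchor_x(theta_j, r_j, c_g, ys):
    # Eq. (positions): x = -y * tan(theta) + (r + [cos; sin]^T c^g) / cos(theta)
    proj = r_j + np.array([np.cos(theta_j), np.sin(theta_j)]) @ c_g
    return -ys * np.tan(theta_j) + proj / np.cos(theta_j)

ys = np.linspace(0.0, 320.0, 36)  # N = 36 y-coordinates sampled uniformly
xs = sample_anchor_x(0.3, 40.0, np.array([400.0, 160.0]), ys)
```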
@@ -279,7 +279,7 @@ Here, $\boldsymbol{W}_{pool}\in \mathbb{R} ^{d_r\times NC_f}$ is employed to fur
 \par
 \textbf{Triplet Head.} The lane detection head classifies and regresses the lane anchors generated by the LPM based on the ROI pooling features $\{\boldsymbol{F}_{i}^{roi}\in \mathbb{R} ^{d_r}\}_{i=1}^K$. A traditional lane detection head \cite{laneatt} is usually equipped with a \textit{One-to-Many} (O2M) classification subhead and a \textit{One-to-Many} (O2M) regression subhead. However, the one-to-many mechanism (\textit{i.e.}, \textit{many candidates for one ground truth}) causes redundant predictions for each lane and thus needs an NMS post-processing operator. NMS, however, is non-differentiable and non-end-to-end, which entails manual hyperparameter tuning and suboptimal performance. To eliminate NMS post-processing while achieving end-to-end learning, we introduce a triplet head module for lane detection.
 \par
-As shown in Fig. \ref{gpm}, the triplet head consists of three components: the O2M classification, the O2M regression, and another \textit{One-to-One} (O2O) classification. The features of each lane anchor $\{\boldsymbol{F}_{j}^{roi}\}$ are fed into the aforementioned three subheads, respectively. To keep both simplicity and efficiency, the O2M classification and O2M regression subheads each apply two \textit{multi-layer perceptrons} (MLPs) to $\{\boldsymbol{F}_{j}^{roi}\}$ and then generate the confidence scores $\left\{{s}_j^g\right\}$ by the O2M classification subhead and the x-coordinate offsets $\{\Delta\boldsymbol{x}_j\}$ by the O2M regression subhead for each lane anchor. More details about the O2M classification and O2M regression subheads can be found in \cite{yolox}. The O2O classification subhead is introduced to generate non-redundant lane candidates within an NMS-free paradigm. However, the direct use of a one-to-one strategy (\textit{i.e.}, \textit{assigning one positive anchor for one ground truth lane}) based on the extracted features will damage the model's learning \cite{dualassign}\cite{yolov10}. Instead, the proposed O2O classification subhead considers both the \textit{confidence prior} $\left\{{s}_j^g\right\}$ of the O2M classification subhead and the \textit{spatial geometric prior} of the polar parameters (\textit{i.e.}, the angle $\theta$ and the radius $r$), and applies these priors to adaptively refine the lane anchor features $\{\boldsymbol{F}_{j}^{roi}\}$, which generates the refined lane anchor features $\{\boldsymbol{D}_{j}^{roi}\}$ and the confidence scores $\left\{\tilde{s}_j^g\right\}$. The structural design draws inspiration from Fast NMS \cite{yolact}, with further particulars in Appendix \ref{NMS_appendix}.
+As shown in Fig. \ref{gpm}, the triplet head consists of three components: the O2M classification, the O2M regression, and another \textit{One-to-One} (O2O) classification. The features of each lane anchor $\{\boldsymbol{F}_{j}^{roi}\}$ are fed into the aforementioned three subheads, respectively. To keep both simplicity and efficiency, the O2M classification and O2M regression subheads each apply two \textit{multi-layer perceptrons} (MLPs) to $\{\boldsymbol{F}_{j}^{roi}\}$ and then generate the confidence scores $\left\{{s}_j^g\right\}$ by the O2M classification subhead and the x-coordinate offsets $\{\Delta\boldsymbol{x}_j\}$ by the O2M regression subhead for each lane anchor. More details about the O2M classification and O2M regression subheads can be found in \cite{yolox}. The O2O classification subhead is introduced to generate non-redundant lane candidates within an NMS-free paradigm. However, the direct use of a one-to-one strategy (\textit{i.e.}, \textit{assigning one positive anchor for one ground truth lane}) based on the extracted features will damage the model's learning \cite{dualassign}\cite{yolov10}. Instead, the proposed O2O classification subhead considers both the \textit{confidence prior} $\left\{{s}_j^g\right\}$ of the O2M classification subhead and the \textit{spatial geometric prior} of the polar parameters (\textit{i.e.}, the angle $\theta$ and the radius $r$), and applies these priors to adaptively refine the lane anchor features $\{\boldsymbol{F}_{j}^{roi}\}$, which generates the refined lane anchor features $\{\boldsymbol{D}_{j}^{roi}\}$ and the confidence scores $\left\{\tilde{s}_j^g\right\}$. The structural design draws inspiration from Fast NMS \cite{yolact}, with further particulars in Appendix \textcolor{red}{B}.
 \par
 More specifically, the O2O classification subhead first calculates the \textit{semantic distance} between the $i$-th anchor with its x-coordinate $\boldsymbol{x}_{i}$ and the $j$-th anchor with its x-coordinate $\boldsymbol{x}_{j}$ as follows:
 \begin{align}
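As a schematic of the triplet head described in the hunk above, the following PyTorch-style sketch shows the three subheads and one way the O2O branch could consume the confidence and polar priors with stopped gradients. Layer widths, the prior-fusion scheme, and all names here are our assumptions, not the paper's implementation.

```python
import torch
import torch.nn as nn

class TripletHead(nn.Module):
    """Schematic triplet head: O2M classification, O2M regression, and an O2O
    classification subhead conditioned on confidence and polar priors."""

    def __init__(self, d_r=192, n_offsets=72):
        super().__init__()
        self.o2m_cls = nn.Sequential(nn.Linear(d_r, d_r), nn.ReLU(), nn.Linear(d_r, 1))
        self.o2m_reg = nn.Sequential(nn.Linear(d_r, d_r), nn.ReLU(), nn.Linear(d_r, n_offsets))
        self.o2o_cls = nn.Sequential(nn.Linear(d_r + 3, d_r), nn.ReLU(), nn.Linear(d_r, 1))

    def forward(self, f_roi, theta, r):
        s_g = torch.sigmoid(self.o2m_cls(f_roi)).squeeze(-1)  # O2M scores {s_j^g}
        dx = self.o2m_reg(f_roi)                              # x-offsets {dx_j}
        # The O2O subhead sees the O2M confidence prior and the polar geometric
        # prior; inputs are detached to stop gradients, as the paper describes.
        prior = torch.stack([s_g.detach(), theta, r], dim=-1)
        s_tilde = torch.sigmoid(
            self.o2o_cls(torch.cat([f_roi.detach(), prior], dim=-1))).squeeze(-1)
        return s_g, dx, s_tilde

head = TripletHead()
K = 20  # hypothetical anchor count
s_g, dx, s_tilde = head(torch.randn(K, 192), torch.rand(K), torch.rand(K) * 100)
```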
@@ -326,9 +326,9 @@ As stated above, the O2O classification subhead is formed from Eqs. (\ref{edge_l
 \end{align}
 where $\tau_{o2o}$ and $\tau_{o2m}$ are two confidence thresholds. The set $\Omega^{pos}$ allows for non-redundant positive predictions without NMS post-processing, as the O2O classification subhead enhances the confidence score variability among similar anchors, making it less sensitive to the two confidence thresholds.
 \par
-\textbf{Loss function for GPM.} After obtaining the positive candidate set $\Omega^{pos}$ for the O2O classification subhead, the Hungarian algorithm \cite{detr} is applied to perform label assignment, \textit{i.e.}, a one-to-one assignment between the positive anchors and the ground truth instances. As for the O2M classification and O2M regression subheads, we use the same approach as in SimOTA \cite{yolox} for label assignment. More details about label assignment and the cost function can be found in Appendix \ref{giou_appendix} and \ref{assign_appendix}. During training, the focal loss \cite{focal} is applied to both the O2O and the O2M classification subheads, denoted as $\mathcal{L}^{o2o}_{cls}$ and $\mathcal{L}^{o2m}_{cls}$, respectively. Furthermore, we adopt the rank loss $\mathcal{L}_{rank}$ \cite{pss} to amplify the disparity between the positive and negative confidences of the O2O classification subhead. Note that, similar to \cite{pss}, we stop the gradient flow from the O2O classification subhead during the training stage to preserve the quality of RoI feature learning.
+\textbf{Loss function for GPM.} After obtaining the positive candidate set $\Omega^{pos}$ for the O2O classification subhead, the Hungarian algorithm \cite{detr} is applied to perform label assignment, \textit{i.e.}, a one-to-one assignment between the positive anchors and the ground truth instances. As for the O2M classification and O2M regression subheads, we use the same approach as in SimOTA \cite{yolox} for label assignment. More details about label assignment and the cost function can be found in Appendix \textcolor{red}{C} and \textcolor{red}{D}. During training, the focal loss \cite{focal} is applied to both the O2O and the O2M classification subheads, denoted as $\mathcal{L}^{o2o}_{cls}$ and $\mathcal{L}^{o2m}_{cls}$, respectively. Furthermore, we adopt the rank loss $\mathcal{L}_{rank}$ \cite{pss} to amplify the disparity between the positive and negative confidences of the O2O classification subhead. Note that, similar to \cite{pss}, we stop the gradient flow from the O2O classification subhead during the training stage to preserve the quality of RoI feature learning.

-To train the O2M regression subhead, we have redefined the GIoU concept (refer to Appendix \ref{giou_appendix} for more details) and adopt the GIoU loss $\mathcal{L}_{GIoU}^{o2m}$ to regress the x-coordinate offsets $\{\Delta\boldsymbol{x}_j\}$ for each positive lane anchor. The end points of lanes are trained with a $Smooth_{L1}$ loss $\mathcal{L}_{end}^{o2m}$. In addition, we propose an auxiliary loss $\mathcal{L}_{aux}$ to facilitate the learning of global features. As illustrated in Fig. \ref{auxloss}, the anchors and ground truth are divided into several segments, with each anchor segment being regressed to the primary components of the corresponding segment of the ground truth. The auxiliary loss $\mathcal{L}_{aux}$ helps the detection head gain a deeper understanding of the global geometric structure, and the auxiliary regression branch is dropped during the evaluation stage. Finally, the classification loss $\mathcal{L} _{cls}^{g}$ and the regression loss $\mathcal{L} _{reg}^{g}$ for GPM are given as follows:
+To train the O2M regression subhead, we have redefined the GIoU concept (refer to Appendix \textcolor{red}{C} for more details) and adopt the GIoU loss $\mathcal{L}_{GIoU}^{o2m}$ to regress the x-coordinate offsets $\{\Delta\boldsymbol{x}_j\}$ for each positive lane anchor. The end points of lanes are trained with a $Smooth_{L1}$ loss $\mathcal{L}_{end}^{o2m}$. In addition, we propose an auxiliary loss $\mathcal{L}_{aux}$ to facilitate the learning of global features. As illustrated in Fig. \ref{auxloss}, the anchors and ground truth are divided into several segments, with each anchor segment being regressed to the primary components of the corresponding segment of the ground truth. The auxiliary loss $\mathcal{L}_{aux}$ helps the detection head gain a deeper understanding of the global geometric structure, and the auxiliary regression branch is dropped during the evaluation stage. Finally, the classification loss $\mathcal{L} _{cls}^{g}$ and the regression loss $\mathcal{L} _{reg}^{g}$ for GPM are given as follows:
 \begin{align}
 \mathcal{L} _{cls}^{g}&=w^{o2m}_{cls}\mathcal{L}^{o2m}_{cls}+w^{o2o}_{cls}\mathcal{L}^{o2o}_{cls}+w_{rank}\mathcal{L}_{rank},
 \\
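The positive-set rule for $\Omega^{pos}$ in the hunk above reduces to two confidence tests per anchor. A minimal sketch, assuming placeholder threshold values rather than the paper's settings:

```python
import numpy as np

def positive_set(s_tilde, s_g, tau_o2o=0.5, tau_o2m=0.4):
    # Omega^pos = {i : s_tilde_i^g > tau_o2o} intersected with {i : s_i^g > tau_o2m}
    return np.where((s_tilde > tau_o2o) & (s_g > tau_o2m))[0]
```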
@@ -373,7 +373,7 @@ For Tusimple, the evaluation is formulated as follows:
 where $C_{clip}$ and $S_{clip}$ represent the number of correct points (predicted points within 20 pixels of the ground truth) and the number of ground truth points, respectively. If the accuracy exceeds 85\%, the prediction is considered correct. TuSimple also reports the \textit{False Positive Rate} ($\mathrm{FPR}=1-\mathrm{Precision}$) and \textit{False Negative Rate} ($\mathrm{FNR}=1-\mathrm{Recall}$) metrics.

 \subsection{Implementation Details}
-All input images are cropped and resized to $800\times320$. Similar to \cite{clrnet}, we apply random affine transformations and random horizontal flips. For optimization, we use the AdamW \cite{adam} optimizer with a learning rate warm-up and a cosine decay strategy. The initial learning rate is set to 0.006. The numbers of sampled points and regression points for each lane anchor are set to 36 and 72, respectively. The power coefficient of the cost function, $\beta$, is set to 6. The training of the whole model (including LPM and GPM) is end-to-end, as in \cite{adnet}\cite{srlane}. All experiments are conducted on a single NVIDIA A100-40G GPU. To keep our model simple, we only use CNN-based backbones, namely ResNet \cite{resnet} and DLA34 \cite{dla}. Other details can be seen in Appendix \ref{vis_appendix}.
+All input images are cropped and resized to $800\times320$. Similar to \cite{clrnet}, we apply random affine transformations and random horizontal flips. For optimization, we use the AdamW \cite{adam} optimizer with a learning rate warm-up and a cosine decay strategy. The initial learning rate is set to 0.006. The numbers of sampled points and regression points for each lane anchor are set to 36 and 72, respectively. The power coefficient of the cost function, $\beta$, is set to 6. The training of the whole model (including LPM and GPM) is end-to-end, as in \cite{adnet}\cite{srlane}. All experiments are conducted on a single NVIDIA A100-40G GPU. To keep our model simple, we only use CNN-based backbones, namely ResNet \cite{resnet} and DLA34 \cite{dla}. Other details can be seen in Appendix \textcolor{red}{E}.


 \begin{table*}[htbp]
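The optimizer settings in this hunk (linear warm-up plus cosine decay from an initial rate of 0.006) can be sketched as below. The warm-up length is not stated in this excerpt, so the 800-step value is a placeholder.

```python
import math

def lr_at(step, total_steps, base_lr=6e-3, warmup_steps=800):
    # Linear warm-up to the initial learning rate, then cosine decay to zero.
    if step < warmup_steps:
        return base_lr * step / warmup_steps
    t = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * t))
```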
@@ -739,7 +739,7 @@ We also explore the stop-gradient strategy for the O2O classification subhead. A

 \textbf{Ablation study on NMS-free block in dense scenarios.} Despite demonstrating the feasibility of replacing NMS with the O2O classification subhead in sparse scenarios, the shortcomings of NMS in dense scenarios remain. To investigate the performance of the NMS-free block in dense scenarios, we conduct experiments on the CurveLanes dataset, as detailed in Table \ref{aba_NMS_dense}.

-In the traditional NMS post-processing \cite{clrernet}, the default IoU threshold is set to 50 pixels. However, this default setting may not always be optimal, especially in dense scenarios where some lane predictions might be erroneously eliminated. Lowering the IoU threshold increases recall but decreases precision. To find the most effective IoU threshold, we experimented with various values and found that a threshold of 15 pixels achieves the best trade-off, resulting in an F1-score of 86.81\%. In contrast, the NMS-free paradigm with the O2O classification subhead achieves an overall F1-score of 87.29\%, which is 0.48\% higher than the optimal threshold setting in the NMS paradigm. Additionally, both precision and recall are improved under the NMS-free approach. This indicates that the O2O classification subhead with the proposed GNN block is capable of learning both explicit geometric distances and implicit semantic distances between anchors, thus providing a more effective solution for dense scenarios than traditional NMS post-processing. More visualization outcomes can be seen in Appendix \ref{vis_appendix}.
+In the traditional NMS post-processing \cite{clrernet}, the default IoU threshold is set to 50 pixels. However, this default setting may not always be optimal, especially in dense scenarios where some lane predictions might be erroneously eliminated. Lowering the IoU threshold increases recall but decreases precision. To find the most effective IoU threshold, we experimented with various values and found that a threshold of 15 pixels achieves the best trade-off, resulting in an F1-score of 86.81\%. In contrast, the NMS-free paradigm with the O2O classification subhead achieves an overall F1-score of 87.29\%, which is 0.48\% higher than the optimal threshold setting in the NMS paradigm. Additionally, both precision and recall are improved under the NMS-free approach. This indicates that the O2O classification subhead with the proposed GNN block is capable of learning both explicit geometric distances and implicit semantic distances between anchors, thus providing a more effective solution for dense scenarios than traditional NMS post-processing. More visualization outcomes can be seen in Appendix \textcolor{red}{E}.

 \section{Conclusion and Future Work}
 In this paper, we propose Polar R-CNN to address two key issues in anchor-based lane detection methods. By incorporating a local and global polar coordinate system, our Polar R-CNN achieves improved performance with fewer anchors. Additionally, the introduction of the O2O classification subhead with the GNN block allows us to replace traditional NMS post-processing, and the NMS-free paradigm demonstrates superior performance in dense scenarios. Our model is highly flexible, and the number of anchors can be adjusted based on the specific scenario. Polar R-CNN is also deployment-friendly due to its simple structure, making it a potential new baseline for lane detection. Future work could explore new label assignment and anchor sampling strategies, as well as more sophisticated model structures such as large kernels and attention mechanisms. We also plan to extend Polar R-CNN to video instance and 3D lane detection tasks, utilizing advanced geometric modeling techniques.
@@ -747,29 +747,29 @@ In this paper, we propose Polar R-CNN to address two key issues in anchor-based
 \bibliographystyle{IEEEtran}
 \bibliography{reference}
 % \newpage
-\begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/wsq.jpg}}]{Shengqi Wang}
-received the Master's degree from Xi'an Jiaotong University, Xi'an, China, in 2022. He is now pursuing the Ph.D. degree in statistics at Xi'an Jiaotong University. His research interests include low-level computer vision, deep learning, and so on.
-\end{IEEEbiography}
+% \begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/wsq.jpg}}]{Shengqi Wang}
+% received the Master's degree from Xi'an Jiaotong University, Xi'an, China, in 2022. He is now pursuing the Ph.D. degree in statistics at Xi'an Jiaotong University. His research interests include low-level computer vision, deep learning, and so on.
+% \end{IEEEbiography}

-\begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/ljm.pdf}}]{Junmin Liu}
-was born in 1982. He received the Ph.D. degree in Mathematics from Xi'an Jiaotong University, Xi'an, China, in 2013. From 2011 to 2012, he served as a Research Assistant with the Department of Geography and Resource Management at the Chinese University of Hong Kong, Hong Kong, China. From 2014 to 2017, he worked as a Visiting Scholar at the University of Maryland, College Park, USA. He is currently a full Professor at the School of Mathematics and Statistics, Xi'an Jiaotong University, Xi'an, China. His research interests are mainly focused on the theory and application of machine learning and image processing. He has published over 60 research papers in international conferences and journals.
-\end{IEEEbiography}
+% \begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/ljm.pdf}}]{Junmin Liu}
+% was born in 1982. He received the Ph.D. degree in Mathematics from Xi'an Jiaotong University, Xi'an, China, in 2013. From 2011 to 2012, he served as a Research Assistant with the Department of Geography and Resource Management at the Chinese University of Hong Kong, Hong Kong, China. From 2014 to 2017, he worked as a Visiting Scholar at the University of Maryland, College Park, USA. He is currently a full Professor at the School of Mathematics and Statistics, Xi'an Jiaotong University, Xi'an, China. His research interests are mainly focused on the theory and application of machine learning and image processing. He has published over 60 research papers in international conferences and journals.
+% \end{IEEEbiography}

-\begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/xiangyongcao.jpg}}]{Xiangyong Cao (Member, IEEE)}
-received the B.Sc. and Ph.D. degrees from Xi’an Jiaotong University, Xi’an, China, in 2012 and 2018, respectively. From 2016 to 2017, he was a Visiting Scholar with Columbia University, New York, NY, USA. He is an Associate Professor with the School of Computer Science and Technology, Xi’an Jiaotong University. His research interests include statistical modeling
-and image processing.
-\end{IEEEbiography}
+% \begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/xiangyongcao.jpg}}]{Xiangyong Cao (Member, IEEE)}
+% received the B.Sc. and Ph.D. degrees from Xi’an Jiaotong University, Xi’an, China, in 2012 and 2018, respectively. From 2016 to 2017, he was a Visiting Scholar with Columbia University, New York, NY, USA. He is an Associate Professor with the School of Computer Science and Technology, Xi’an Jiaotong University. His research interests include statistical modeling
+% and image processing.
+% \end{IEEEbiography}

-\begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/photo_ZengjieSong.jpg}}]{Zengjie Song}
-received the B.S. degree in applied mathematics and the Ph.D. degree in statistics from the Xi’an Jiaotong University (XJTU), Xi’an, China, in 2013 and 2020, respectively. From 2017 to 2018, he was a visiting Ph.D. student with the Department of Computer Science, University of Illinois at Urbana–Champaign, Champaign, IL, USA. From 2020 to 2023, he was a Postdoctoral Researcher with the Institute of Automation, Chinese Academy of Sciences, Beijing, China. In May 2023, he returned to the XJTU, where he is currently an Assistant Professor with the Department of Statistics. His research interests include predictive coding, multimodal learning, generative models, and computer vision, with an emphasis on the intersection of machine learning and computational neuroscience.
-\end{IEEEbiography}
+% \begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/photo_ZengjieSong.jpg}}]{Zengjie Song}
+% received the B.S. degree in applied mathematics and the Ph.D. degree in statistics from the Xi’an Jiaotong University (XJTU), Xi’an, China, in 2013 and 2020, respectively. From 2017 to 2018, he was a visiting Ph.D. student with the Department of Computer Science, University of Illinois at Urbana–Champaign, Champaign, IL, USA. From 2020 to 2023, he was a Postdoctoral Researcher with the Institute of Automation, Chinese Academy of Sciences, Beijing, China. In May 2023, he returned to the XJTU, where he is currently an Assistant Professor with the Department of Statistics. His research interests include predictive coding, multimodal learning, generative models, and computer vision, with an emphasis on the intersection of machine learning and computational neuroscience.
+% \end{IEEEbiography}

-\begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/sunkai.jpg}}]{Kai Sun}
-received his Ph.D. degree in statistics from Xi'an Jiaotong University, Xi'an, China, in 2020. He joined Xi'an Jiaotong University, China, in 2020, where he is currently an associate professor in the School of Mathematics and Statistics. His research interests include deep learning and image processing. Up to now, he has authored and coauthored one monograph and 20+ academic papers, primarily in journals such as IEEE TIP, IEEE TNNLS and others. Additionally, he has published one ESI highly cited paper and one ESI hot paper as the first author.
-\end{IEEEbiography}
-\vfill
-
-\newpage
+% \begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/sunkai.jpg}}]{Kai Sun}
+% received his Ph.D. degree in statistics from Xi'an Jiaotong University, Xi'an, China, in 2020. He joined Xi'an Jiaotong University, China, in 2020, where he is currently an associate professor in the School of Mathematics and Statistics. His research interests include deep learning and image processing. Up to now, he has authored and coauthored one monograph and 20+ academic papers, primarily in journals such as IEEE TIP, IEEE TNNLS and others. Additionally, he has published one ESI highly cited paper and one ESI hot paper as the first author.
+% \end{IEEEbiography}
+% \vfill
+% \newpage
 \clearpage
 % when the appendix has multiple sections
 \enablecitations

@@ -850,7 +850,7 @@ We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classi
 The corresponding positive anchors, $\left\{ \theta _i,r_{i}^{g} \right\} |_{i=1}^{K}$;\\
 The confidence emanating from the O2M classification subhead, $s_i^g$;\\
 The regressions emanating from the O2M regression subhead, denoted as $\left\{ Lane_i \right\} |_{i=1}^{K}$;\\
-The predetermined thresholds $\tau^\theta$, $\tau^r$, $\tau_d$ and $\lambda _{o2m}^{s}$.
+The predetermined thresholds $\tau^\theta$, $\lambda^g$, $\tau_d$ and $\tau_{o2m}$.
 \ENSURE ~~\\ % Output of the algorithm
 \STATE Calculate the confidence-prior adjacency matrix $\boldsymbol{A}^{C}\in\mathbb{R}^{K\times K}$, defined as follows:
 \begin{align}
@@ -863,7 +863,7 @@
 \STATE Calculate the geometric-prior adjacency matrix $\boldsymbol{A}^{G}\in\mathbb{R}^{K\times K}$, which is defined as follows:
 \begin{align}
 A_{ij}^{G}=\begin{cases}
-1,\, \mathrm{if}\,\, \left| \theta _i-\theta _j \right|<\tau^{\theta}\,\,\mathrm{and}\,\,\left| r_{i}^{g}-r_{j}^{g} \right|<\tau^r;\\
+1,\, \mathrm{if}\,\, \left| \theta _i-\theta _j \right|<\tau^{\theta}\,\,\mathrm{and}\,\,\left| r_{i}^{g}-r_{j}^{g} \right|<\lambda^g;\\
 0,\, \mathrm{others}.\\
 \end{cases}
 \label{geometric prior matrix}
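A minimal sketch of the two prior adjacency matrices used by the algorithm above. The geometric prior follows Eq. (\ref{geometric prior matrix}); the exact form of the confidence prior $\boldsymbol{A}^{C}$ is cut off by the hunk boundary, so the Fast-NMS-style comparison below is an assumption, and the threshold values are placeholders.

```python
import numpy as np

def adjacency_matrices(theta, r, s_g, tau_theta=0.05, lam_g=50.0):
    # Geometric prior A^G: anchors i, j are neighbours when both their
    # angles and global radii are close (Eq. above).
    dtheta = np.abs(theta[:, None] - theta[None, :])
    dr = np.abs(r[:, None] - r[None, :])
    a_geo = ((dtheta < tau_theta) & (dr < lam_g)).astype(float)
    # Confidence prior A^C (assumed Fast-NMS-style): A^C_ij = 1 iff s_j^g > s_i^g,
    # i.e., anchor i can only be suppressed by higher-confidence anchors.
    a_conf = (s_g[None, :] > s_g[:, None]).astype(float)
    return a_conf, a_geo
```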
@@ -896,12 +896,12 @@

 The new algorithm possesses a distinctly different format from its predecessor \cite{yolact}. We introduce a geometric-prior adjacency matrix characterized by $\boldsymbol{A}^G$, alleviating the suppression relationship between disparate anchors. It is straightforward to show that, when all elements of $\boldsymbol{A}^{G}$ are set to $1$ (\textit{i.e.}, disregarding geometric priors), Algorithm \ref{Graph Fast NMS} is equivalent to Fast NMS. Building upon our newly proposed sort-free Fast NMS with geometric prior, we design the structure of the one-to-one classification head.

-The principal limitations of NMS lie in two steps, namely the definition of distance, which stems from geometry (\textit{i.e.}, Eq. (\ref{al_1-3})), and the threshold $\lambda^{g}$ employed to eliminate redundant predictions (\textit{i.e.}, Eq. (\ref{al_1-4})). For instance, in scenarios involving double lines, despite the minimal geometric distance between the two lane instances, their semantic divergence is remarkably pronounced. Consequently, we replace the aforementioned two steps with trainable neural networks, allowing them to alleviate the limitations of Fast NMS in a data-driven fashion. The neural network blocks that replace Eq. (\ref{al_1-3}) are Eqs. (\ref{edge_layer_1})-(\ref{edge_layer_3}) in the main text.
+The principal limitations of NMS lie in two steps, namely the definition of distance, which stems from geometry (\textit{i.e.}, Eq. (\ref{al_1-3})), and the threshold employed to eliminate redundant predictions (\textit{i.e.}, Eq. (\ref{al_1-4})). For instance, in scenarios involving double lines, despite the minimal geometric distance between the two lane instances, their semantic divergence is remarkably pronounced. Consequently, we replace the aforementioned two steps with trainable neural networks, allowing them to alleviate the limitations of Fast NMS in a data-driven fashion. The neural network blocks that replace Eq. (\ref{al_1-3}) are Eqs. (\ref{edge_layer_1})-(\ref{edge_layer_3}) in the main text.

 In Eq. (\ref{edge_layer_3}), the inverse distance $\boldsymbol{D}_{ij}^{edge}\in\mathbb{R}^{d_n}$ transcends its scalar form, encapsulating the semantic distance between predictions.
 We use element-wise max pooling for the tensor as the replacement of the max operation applied to scalars, as delineated in Eq. (\ref{maxpooling}) in the main text. Furthermore, the predetermined $\left( \tau ^d \right) ^{-1}$ is no longer utilized as the threshold of the distance. We define a neural network as an implicit decision plane to formulate the final score $\tilde{s}_{i}^{g}$, as defined in Eq. (\ref{node_layer}), serving as the replacement of Eq. (\ref{al_1-4}).

-The score $\tilde{s}_{i}^{g}$ output by the neural network transitions from a binary score to a continuous soft score ranging from 0 to 1. We introduce a new threshold $\lambda^s_{o2o}$ within the updated criteria of Eq. (\ref{al_1-5}):
+The score $\tilde{s}_{i}^{g}$ output by the neural network transitions from a binary score to a continuous soft score ranging from 0 to 1. We introduce a new threshold $\tau_{o2o}$ within the updated criteria of Eq. (\ref{al_1-5}):
 \begin{align}
 \varOmega_{nms-free}^{pos}=\left\{i|\tilde{s}_{i}^{g}>\tau_{o2o} \right\} \cap \left\{ i|s_{i}^{g}>\tau_{o2m} \right\}.
 \end{align}
@@ -970,7 +970,7 @@ To ensure the IoU between lane instances aligns with the conventions of general
 where $w^{b}$ is the base semi-width parameter and $w_{i,p}$ is the actual semi-width of the $p$-th lane instance. The sets $\left\{ b_{i,p}^{l} \right\} _{i=1}^{N}$ and $\left\{ b_{i,p}^{r} \right\} _{i=1}^{N}$ signify the left and right boundaries of the $p$-th lane instance. Subsequently, we define the overlap, gap, and union distances between lane instances:
 \begin{align}
 d_{i,pq}^{\mathcal{O}}&=\max \left( \min \left( b_{i,p}^{r}, b_{i,q}^{r} \right) -\max \left( b_{i,p}^{l}, b_{i,q}^{l} \right) , 0 \right),\\
-d_{i,pq}^{\xi}&=\max \left( \max \left( b_{i,p}^{l}, b_{i,q}^{l} \right) -\min \left( b_{i,p}^{l}, b_{i,q}^{l} \right) , 0 \right),\\
+d_{i,pq}^{\xi}&=\max \left( \max \left( b_{i,p}^{l}, b_{i,q}^{l} \right) -\min \left( b_{i,p}^{r}, b_{i,q}^{r} \right) , 0 \right),\\
 d_{i,pq}^{\mathcal{U}}&=\max \left( b_{i,p}^{r}, b_{i,q}^{r} \right) -\min \left( b_{i,p}^{l}, b_{i,q}^{l} \right).
 \end{align}
 The quantities $\left\{d_{i,pq}^{\mathcal{O}}\right\}_{i=1}^{N}$, $\left\{d_{i,pq}^{\xi}\right\}_{i=1}^{N}$ and $\left\{d_{i,pq}^{\mathcal{U}}\right\}_{i=1}^{N}$ denote the overlap distance, gap distance, and union distance, respectively. These definitions closely resemble but slightly differ from those in \cite{clrnet} and \cite{adnet}, with modifications to ensure non-negative values. This formulation aims to maintain consistency with the IoU definitions used for bounding boxes. Thus, the overall GLaneIoU between the $p$-th and $q$-th lane instances is expressed as:
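The overlap, gap, and union distances above translate directly into array operations. A minimal NumPy sketch using the corrected gap definition from the `+` line; the array names are ours, not the paper's:

```python
import numpy as np

def lane_distances(bl_p, br_p, bl_q, br_q):
    # Row-wise distances between lane instances p and q, given their left/right
    # boundaries at the N sampled rows.
    d_over = np.maximum(np.minimum(br_p, br_q) - np.maximum(bl_p, bl_q), 0.0)  # overlap
    d_gap = np.maximum(np.maximum(bl_p, bl_q) - np.minimum(br_p, br_q), 0.0)   # gap
    d_union = np.maximum(br_p, br_q) - np.minimum(bl_p, bl_q)                  # union
    return d_over, d_gap, d_union
```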
@@ -1242,12 +1242,6 @@ In the one-to-many label assignment, we simply use SimOTA \cite{yolox}, which al
 Given the ground truth label generated by the label assignment strategy for each prediction, we can construct the loss function for the training phase. As illustrated in Fig. \ref{head_assign}, $\mathcal{L}_{cls}^{o2o}$ and $\mathcal{L}_{rank}$ are for the O2O classification subhead, $\mathcal{L}_{cls}^{o2m}$ is for the O2M classification subhead, whereas $\mathcal{L}_{GIOU}$ (with $g=1$), $\mathcal{L}_{end}$ and $\mathcal{L}_{aux}$ are for the O2M regression subhead.
 \label{assign_appendix}

-
-
-
-
-
-
 \section{The Supplement of Implementation Details and Visualization Results}
 Some important implementation details for each dataset are shown in Table \ref{dataset_info}. It includes the dataset information we employed to conduct the experiments and visualizations, the parameters for data processing, as well as the hyperparameters of Polar R-CNN.

main2.bbl (new file, 99 lines)
@@ -0,0 +1,99 @@
% Generated by IEEEtran.bst, version: 1.14 (2015/08/26)
\begin{thebibliography}{10}
\providecommand{\url}[1]{#1}
\csname url@samestyle\endcsname
\providecommand{\newblock}{\relax}
\providecommand{\bibinfo}[2]{#2}
\providecommand{\BIBentrySTDinterwordspacing}{\spaceskip=0pt\relax}
\providecommand{\BIBentryALTinterwordstretchfactor}{4}
\providecommand{\BIBentryALTinterwordspacing}{\spaceskip=\fontdimen2\font plus
\BIBentryALTinterwordstretchfactor\fontdimen3\font minus
\fontdimen4\font\relax}
\providecommand{\BIBforeignlanguage}[2]{{%
\expandafter\ifx\csname l@#1\endcsname\relax
\typeout{** WARNING: IEEEtran.bst: No hyphenation pattern has been}%
\typeout{** loaded for the language `#1'. Using the pattern for}%
\typeout{** the default language instead.}%
\else
\language=\csname l@#1\endcsname
\fi
#2}}
\providecommand{\BIBdecl}{\relax}
\BIBdecl

\bibitem{detr}
N.~Carion, F.~Massa, G.~Synnaeve, N.~Usunier, A.~Kirillov, and S.~Zagoruyko,
``End-to-end object detection with transformers,'' in \emph{European
conference on computer vision}.\hskip 1em plus 0.5em minus 0.4em\relax
Springer, 2020, pp. 213--229.

\bibitem{learnNMS}
J.~Hosang, R.~Benenson, and B.~Schiele, ``Learning non-maximum suppression,''
in \emph{Proceedings of the IEEE conference on computer vision and pattern
recognition}, 2017, pp. 4507--4515.

\bibitem{yolov10}
A.~Wang, H.~Chen, L.~Liu, K.~Chen, Z.~Lin, J.~Han, and G.~Ding, ``Yolov10:
Real-time end-to-end object detection,'' \emph{arXiv preprint
arXiv:2405.14458}, 2024.

\bibitem{o2o}
P.~Sun, Y.~Jiang, E.~Xie, W.~Shao, Z.~Yuan, C.~Wang, and P.~Luo, ``What makes
for end-to-end object detection?'' in \emph{International Conference on
Machine Learning}.\hskip 1em plus 0.5em minus 0.4em\relax PMLR, 2021, pp.
9934--9944.

\bibitem{o3d}
J.~Wang, L.~Song, Z.~Li, H.~Sun, J.~Sun, and N.~Zheng, ``End-to-end object
detection with fully convolutional network,'' in \emph{Proceedings of the
IEEE/CVF conference on computer vision and pattern recognition}, 2021, pp.
15\,849--15\,858.

\bibitem{relationnet}
H.~Hu, J.~Gu, Z.~Zhang, J.~Dai, and Y.~Wei, ``Relation networks for object
detection,'' in \emph{Proceedings of the IEEE conference on computer vision
and pattern recognition}, 2018, pp. 3588--3597.

\bibitem{yolact}
D.~Bolya, C.~Zhou, F.~Xiao, and Y.~J. Lee, ``Yolact: Real-time instance
segmentation,'' in \emph{Proceedings of the IEEE/CVF international conference
on computer vision}, 2019, pp. 9157--9166.

\bibitem{iouloss}
J.~Yu, Y.~Jiang, Z.~Wang, Z.~Cao, and T.~Huang, ``Unitbox: An advanced object
detection network,'' in \emph{Proceedings of the 24th ACM international
conference on Multimedia}, 2016, pp. 516--520.

\bibitem{giouloss}
H.~Rezatofighi, N.~Tsoi, J.~Gwak, A.~Sadeghian, I.~Reid, and S.~Savarese,
``Generalized intersection over union: A metric and a loss for bounding box
regression,'' in \emph{Proceedings of the IEEE/CVF conference on computer
vision and pattern recognition}, 2019, pp. 658--666.

\bibitem{clrnet}
T.~Zheng, Y.~Huang, Y.~Liu, W.~Tang, Z.~Yang, D.~Cai, and X.~He, ``Clrnet:
Cross layer refinement network for lane detection,'' in \emph{Proceedings of
the IEEE/CVF conference on computer vision and pattern recognition}, 2022,
pp. 898--907.

\bibitem{adnet}
L.~Xiao, X.~Li, S.~Yang, and W.~Yang, ``Adnet: Lane shape prediction via anchor
decomposition,'' in \emph{Proceedings of the IEEE/CVF International
Conference on Computer Vision}, 2023, pp. 6404--6413.

\bibitem{date}
Y.~Chen, Q.~Chen, Q.~Hu, and J.~Cheng, ``Date: Dual assignment for end-to-end
fully convolutional object detection,'' \emph{arXiv preprint
arXiv:2211.13859}, 2022.

\bibitem{clrernet}
H.~Honda and Y.~Uchida, ``Clrernet: Improving confidence of lane detection with
laneiou,'' in \emph{Proceedings of the IEEE/CVF Winter Conference on
Applications of Computer Vision}, 2024, pp. 1176--1185.

\bibitem{yolox}
Z.~Ge, S.~Liu, F.~Wang, Z.~Li, and J.~Sun, ``Yolox:
Exceeding yolo series in 2021,'' \emph{arXiv preprint arXiv:2107.08430},
2021.

\end{thebibliography}
main2.tex (new file, 539 lines)
@@ -0,0 +1,539 @@
\documentclass[lettersize,journal]{IEEEtran}
\usepackage{amsmath,amsfonts}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{array}
% \usepackage[caption=false,font=normalsize,labelfont=sf,textfont=sf]{subfig}
\usepackage{textcomp}
\usepackage{stfloats}
\usepackage{url}
\usepackage{verbatim}
\usepackage{graphicx}
\usepackage{cite}
\usepackage{subcaption}
\usepackage{multirow}
\usepackage[T1]{fontenc}
\usepackage{adjustbox}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{tikz}
\usepackage{tabularx}
\usepackage{mathrsfs}
\usepackage{etoolbox}

% Define a command to disable citations
\newcommand{\disablecitations}{%
\renewcommand{\cite}[1]{}%
}

% Define a command to re-enable citations
\newcommand{\enablecitations}{%
\let\cite\oldcite%
}

% Save the original \cite command
\let\oldcite\cite

\usepackage[colorlinks,bookmarksopen,bookmarksnumbered, linkcolor=red]{hyperref}
\definecolor{darkgreen}{RGB}{17,159,27} %
\aboverulesep=0pt
\belowrulesep=0pt
\hyphenation{op-tical net-works semi-conduc-tor IEEE-Xpolare}
% updated with editorial comments 8/9/2021
% \renewcommand{\includegraphics}[2][]{} % redefine \includegraphics as a no-op
\begin{document}
\disablecitations
\enablecitations

\title{Appendix and Supplementary Materials}

\markboth{Appendix and Supplementary Materials}%
{Appendix and Supplementary Materials}
\maketitle
\begin{appendices}
\setcounter{table}{0} % start counters from 0 so tables are displayed as A1, A2, ...
\setcounter{figure}{0}
\setcounter{section}{0}
\setcounter{equation}{0}
\renewcommand{\thetable}{A\arabic{table}}
\renewcommand{\thefigure}{A\arabic{figure}}
\renewcommand{\thesection}{A\arabic{section}}
\renewcommand{\theequation}{A\arabic{equation}}
\addcontentsline{toc}{section}{Appendix} % add the appendix heading to the table of contents if needed
\section{Details about the Coordinate Systems}
In this section, we introduce the details of the coordinate systems employed in our model and the coordinate transformations between them.
For convenience, we adopt a Cartesian coordinate system instead of the image coordinate system, wherein the y-axis is oriented from bottom to top and the x-axis from left to right. The coordinates of the local poles $\left\{\boldsymbol{c}^l_i\right\}$, the global pole $\boldsymbol{c}^g$, and the sampled points $\{(x_{1,j}^s,y_{1,j}^s),(x_{2,j}^s,y_{2,j}^s),\cdots,(x_{N,j}^s,y_{N,j}^s)\}_{j=1}^{K}$ of anchors are all expressed in this coordinate system by default.

We now furnish the derivation of the transformations between different coordinate systems, with the crucial symbols elucidated in Fig. \ref{elu_proof}. These geometric transformations can be demonstrated with Analytic geometry theory in Euclidean space. The derivation of local to global polar coordinate system is presented as follows:
\begin{align}
r_{j}^{g}&=\left\| \overrightarrow{c^gh_{j}^{g}} \right\| =\left\| \overrightarrow{h_{j}^{a}h_{j}^{l}} \right\| \notag\\
&=\left\| \overrightarrow{c_{j}^{l}h_{j}^{l}}-\overrightarrow{c_{j}^{l}h_{j}^{a}} \right\| =\left\| \overrightarrow{c_{j}^{l}h_{j}^{l}} \right\| -\left\| \overrightarrow{c_{j}^{l}h_{j}^{a}} \right\| \notag\\
&=\left\| \overrightarrow{c_{j}^{l}h_{j}^{l}} \right\| -\frac{\overrightarrow{c_{j}^{l}h_{j}^{a}}}{\left\| \overrightarrow{c_{j}^{l}h_{j}^{a}} \right\|}\cdot \overrightarrow{c_{j}^{l}h_{j}^{a}} =\left\| \overrightarrow{c_{j}^{l}h_{j}^{l}} \right\| +\frac{\overrightarrow{c_{j}^{l}h_{j}^{a}}}{\left\| \overrightarrow{c_{j}^{l}h_{j}^{a}} \right\|}\cdot \overrightarrow{c^gc_{j}^{l}} \notag\\
&=r_{j}^{l}+\left[ \cos \theta _j;\sin \theta _j \right] ^T\left( \boldsymbol{c}_{j}^{l}-\boldsymbol{c}^g \right),
\label{proof_l2g}
\end{align}
where $h_j^l$, $h_j^g$, and $h_j^a$ denote the feet of the respective perpendiculars in Fig. \ref{elu_proof}.
Analogously, the sampled points along a lane anchor are derived as follows:
\begin{align}
&\overrightarrow{c^gp_{i,j}^{s}}\cdot \overrightarrow{c^gh_{j}^{g}}=\overrightarrow{c^gh_{j}^{g}}\cdot \overrightarrow{c^gh_{j}^{g}} \notag\\
\Rightarrow &\overrightarrow{c^gp_{i,j}^{s}}\cdot \overrightarrow{c^gh_{j}^{g}}=\left\| \overrightarrow{c^gh_{j}^{g}} \right\| \left\| \overrightarrow{c^gh_{j}^{g}} \right\| \notag\\
\Rightarrow &\frac{\overrightarrow{c^gh_{j}^{g}}}{\left\| \overrightarrow{c^gh_{j}^{g}} \right\|}\cdot \overrightarrow{c^gp_{i,j}^{s}}=\left\| \overrightarrow{c^gh_{j}^{g}} \right\| \notag\\
\Rightarrow &\left[ \cos \theta _j;\sin \theta _j \right] ^T\left( \boldsymbol{p}_{i,j}^{s}-\boldsymbol{c}^g \right) =r_{j}^{g}\notag\\
\Rightarrow &x_{i,j}^{s}\cos \theta _j+y_{i,j}^{s}\sin \theta _j=r_{j}^{g}+\left[ \cos \theta _j;\sin \theta _j \right] ^T\boldsymbol{c}^g \notag\\
\Rightarrow &x_{i,j}^{s}=-y_{i,j}^{s}\tan \theta _j+\frac{r_{j}^{g}+\left[ \cos \theta _j;\sin \theta _j \right] ^T\boldsymbol{c}^g}{\cos \theta _j},
\label{proof_sample}
\end{align}
where $p_{i,j}^{s}$ represents the $i$-th sampled point of the $j$-th lane anchor, whose coordinates are $\boldsymbol{p}_{i,j}^{s}\equiv(x_{i,j}^s, y_{i,j}^s)$.
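For clarity, the following is a minimal NumPy sketch of the two transformations above (assuming $\cos\theta_j\neq 0$); the function and variable names are illustrative and do not correspond to our released implementation.
\begin{verbatim}
import numpy as np

def local_to_global(theta, r_l, c_l, c_g):
    """Convert a local polar radius r_l (pole c_l) into the global
    polar radius r_g (pole c_g) for an anchor with angle theta."""
    n = np.array([np.cos(theta), np.sin(theta)])  # unit normal of the anchor
    return r_l + n @ (c_l - c_g)

def sample_x(theta, r_g, c_g, ys):
    """x-coordinates of points sampled at heights ys on the anchor
    line described by (theta, r_g) in the global polar system."""
    n = np.array([np.cos(theta), np.sin(theta)])
    return -ys * np.tan(theta) + (r_g + n @ c_g) / np.cos(theta)
\end{verbatim}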
\label{appendix_coord}

\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{thesis_figure/elu_proof.png}
\caption{The symbols employed in the derivation of coordinate transformations across different coordinate systems.}
\label{elu_proof}
\end{figure}

\section{The Design Principles of the One-to-One Classification Head}
Two fundamental prerequisites of the NMS-free framework lie in the label assignment strategy and the head structure.

As for the label assignment strategy, previous works use one-to-many label assignment, which drives the detection head to produce redundant predictions for each ground truth and thus necessitates NMS post-processing. Consequently, some works \cite{detr}\cite{learnNMS} proposed one-to-one label assignment, e.g., via the Hungarian algorithm, which forces the model to predict a single positive sample for each lane.

However, directly applying one-to-one label assignment harms the learning of the model: structures such as MLPs and CNNs struggle to assimilate the ``one-to-one'' characteristic, resulting in degraded performance compared with one-to-many label assignment plus NMS post-processing \cite{yolov10}\cite{o2o}. Consider a trivial example. Let $\boldsymbol{F}^{roi}_{i}$ denote the RoI features extracted from the $i$-th anchor, with the model trained under one-to-one label assignment. Assume that the $i$-th and $j$-th anchors both lie close to the ground truth and overlap with each other, so that the corresponding RoI features are similar:
\begin{align}
\boldsymbol{F}_{i}^{roi}\approx \boldsymbol{F}_{j}^{roi}.
\end{align}
Suppose that $\boldsymbol{F}^{roi}_{i}$ is assigned as a positive sample while $\boldsymbol{F}^{roi}_{j}$ is assigned as a negative one; the ideal outcome should then manifest as:
\begin{align}
f_{cls}\left( \boldsymbol{F}_{i}^{roi} \right) &\rightarrow 1, \notag\\
f_{cls}\left( \boldsymbol{F}_{j}^{roi} \right) &\rightarrow 0,
\label{sharp fun}
\end{align}
where $f_{cls}$ represents a classification head with an ordinary structure such as an MLP or a CNN. Eq. (\ref{sharp fun}) implies that $f_{cls}$ needs to be ``sharp'' enough to differentiate between two similar features; in other words, its output must change rapidly over small distances in feature space. Such a ``sharp'' mapping is hard for MLPs or CNNs to learn on their own. Consequently, additional heuristic structures such as \cite{o3d}\cite{relationnet} need to be developed.

We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classification subhead. Fast NMS is an iteration-free post-processing algorithm derived from traditional NMS. Furthermore, we incorporate a sort-free strategy along with geometric priors into Fast NMS, with the specifics delineated in Algorithm \ref{Graph Fast NMS}.

\begin{algorithm}[t]
\caption{Fast NMS with Geometric Prior.}
\begin{algorithmic}[1] % the optional argument 1 numbers every line
\REQUIRE ~~\\ % Input
The indices of all anchors, $1, 2, \cdots, i, \cdots, K$;\\
The polar parameters of the corresponding anchors, $\left\{ \theta _i,r_{i}^{g} \right\} |_{i=1}^{K}$;\\
The confidences emanating from the O2M classification subhead, $s_i^g$;\\
The regressions emanating from the O2M regression subhead, denoted as $\left\{ Lane_i \right\} |_{i=1}^{K}$;\\
The predetermined thresholds $\tau^\theta$, $\lambda^g$, $\tau^d$, and $\tau_{o2m}$.
\ENSURE ~~\\ % Output
The final selection set $\varOmega_{nms}^{pos}$.
\STATE Calculate the confidence-prior adjacency matrix $\boldsymbol{A}^{C}\in\mathbb{R}^{K\times K}$, defined as follows:
\begin{align}
A_{ij}^{C}=\begin{cases}
	1,\, \mathrm{if}\,\, s_i^g>s_j^g\,\,\mathrm{or}\,\,\left( s_i^g=s_j^g\,\,\mathrm{and}\,\,i>j \right);\\
	0,\, \mathrm{otherwise}.\\
\end{cases}
\label{confidential matrix}
\end{align}
\STATE Calculate the geometric-prior adjacency matrix $\boldsymbol{A}^{G}\in\mathbb{R}^{K\times K}$, which is defined as follows:
\begin{align}
A_{ij}^{G}=\begin{cases}
	1,\, \mathrm{if}\,\, \left| \theta _i-\theta _j \right|<\tau^{\theta}\,\,\mathrm{and}\,\,\left| r_{i}^{g}-r_{j}^{g} \right|<\lambda^g;\\
	0,\, \mathrm{otherwise}.\\
\end{cases}
\label{geometric prior matrix}
\end{align}
\STATE Calculate the inverse distance matrix $\boldsymbol{D} \in \mathbb{R} ^{K \times K}$, whose element $D_{ij}$ is defined as follows:
\begin{align}
D_{ij}=d^{-1}\left( Lane_i,Lane_j \right),
\label{al_1-3}
\end{align}
where $d\left(\cdot, \cdot \right)$ is a predefined function quantifying the distance between two lane predictions, such as IoU.
\STATE Define the adjacency matrix $\boldsymbol{A} = \boldsymbol{A}^{C} \odot \boldsymbol{A}^{G}$; the final confidence $\tilde{s}_{j}^{g}$ is calculated as follows:
\begin{align}
\tilde{s}_{j}^{g}=\begin{cases}
	1,\, \mathrm{if}\,\, \mathrm{Max}\left(\boldsymbol{D}(:,j)|\boldsymbol{A}(:,j)=1\right)<\left( \tau ^d \right) ^{-1};\\
	0,\, \mathrm{otherwise},\\
\end{cases}
\label{al_1-4}
\end{align}
where $j=1,2,\cdots,K$ and $\mathrm{Max}(\cdot|\boldsymbol{A}(:,j)=1)$ takes the maximum over the $j$-th column of $\boldsymbol{D}$, restricted to the rows $i$ with $A_{ij}=1$.
\STATE Get the final selection set:
\begin{align}
\varOmega_{nms}^{pos}=\left\{ i|\tilde{s}_{i}^{g}=1 \right\} \cap \left\{i|s_{i}^{g}>\tau_{o2m} \right\}.
\label{al_1-5}
\end{align}
\RETURN The final selection result $\varOmega_{nms}^{pos}$.
\end{algorithmic}
\label{Graph Fast NMS}
\end{algorithm}
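
To make the procedure concrete, below is a minimal NumPy sketch of Algorithm \ref{Graph Fast NMS}; the pairwise distance matrix \texttt{lane\_dist} is assumed to be precomputed with whichever metric is plugged into Eq. (\ref{al_1-3}).
\begin{verbatim}
import numpy as np

def fast_nms_geo(theta, r_g, scores, lane_dist,
                 tau_theta, lam_g, tau_d, tau_o2m):
    """Sort-free Fast NMS with geometric prior.

    theta, r_g, scores : (K,) anchor angles, global radii, O2M confidences
    lane_dist          : (K, K) pairwise lane distances d(Lane_i, Lane_j)
    Returns the indices of the retained predictions.
    """
    K = len(scores)
    i, j = np.meshgrid(np.arange(K), np.arange(K), indexing="ij")
    # Confidence prior: row i may suppress column j only if it ranks higher.
    A_c = (scores[:, None] > scores[None, :]) | \
          ((scores[:, None] == scores[None, :]) & (i > j))
    # Geometric prior: only anchors with similar (theta, r) interact.
    A_g = (np.abs(theta[:, None] - theta[None, :]) < tau_theta) & \
          (np.abs(r_g[:, None] - r_g[None, :]) < lam_g)
    A = A_c & A_g
    # Inverse distance matrix; non-adjacent entries are masked out.
    D = np.where(A, 1.0 / np.maximum(lane_dist, 1e-9), 0.0)
    # A prediction survives if no admissible neighbor is closer than tau_d.
    keep = D.max(axis=0) < 1.0 / tau_d
    return np.where(keep & (scores > tau_o2m))[0]
\end{verbatim}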

The new algorithm possesses a distinctly different form from its predecessor \cite{yolact}. We introduce a geometric-prior adjacency matrix $\boldsymbol{A}^G$ that relaxes the suppression relationship between disparate anchors. It is straightforward to show that when all elements of $\boldsymbol{A}^{G}$ are set to $1$ (\textit{i.e.}, the geometric prior is disregarded), Algorithm \ref{Graph Fast NMS} is equivalent to Fast NMS. Building upon the newly proposed sort-free Fast NMS with geometric prior, we design the structure of the one-to-one classification head.

The principal limitations of NMS lie in two steps: the purely geometric definition of distance (\textit{i.e.}, Eq. (\ref{al_1-3})) and the fixed threshold employed to eliminate redundant predictions (\textit{i.e.}, Eq. (\ref{al_1-4})). For instance, in scenarios involving double lines, although the geometric distance between the two lane instances is minimal, their semantic divergence is remarkably pronounced. Consequently, we replace these two steps with trainable neural networks, allowing the limitations of Fast NMS to be alleviated in a data-driven fashion. The neural network blocks replacing Eq. (\ref{al_1-3}) are as follows:
\begin{align}
\widehat{\boldsymbol{F}}_{i}^{roi}&\gets \mathrm{ReLU}\left( \boldsymbol{W}_{roi}\boldsymbol{F}_{i}^{roi}+\boldsymbol{b}_{roi} \right), i=1,\cdots,K,\label{edge_layer_1}\\
\boldsymbol{F}_{ij}^{edge}&\gets \boldsymbol{W}_{in}\widehat{\boldsymbol{F}}_{j}^{roi}-\boldsymbol{W}_{out}\widehat{\boldsymbol{F}}_{i}^{roi},\label{edge_layer_2}\\
\boldsymbol{D}_{ij}^{edge}&\gets \mathrm{MLP}_{edge}\left(\boldsymbol{F}_{ij}^{edge}+\boldsymbol{W}_s\left( \boldsymbol{x}_{j}-\boldsymbol{x}_{i} \right) +\boldsymbol{b}_s \right).\label{edge_layer_3}
\end{align}

In Eq. (\ref{edge_layer_3}), the inverse distance $\boldsymbol{D}_{ij}^{edge}\in\mathbb{R}^{d_n}$ transcends its scalar form and encapsulates the semantic distance between predictions. We apply element-wise max pooling to this tensor as the replacement for the max operation on scalars. Furthermore, the predetermined $\left( \tau ^d \right) ^{-1}$ is no longer utilized as the distance threshold; instead, we define a neural network as an implicit decision plane to formulate the final score $\tilde{s}_{j}^{g}$:
\begin{align}
\boldsymbol{D}_j^{roi}&\gets\mathrm{MPool}_{col}\left(\boldsymbol{D}^{edge}(:,j,:)|\boldsymbol{A}(:,j)=1\right), \label{maxpooling}\\
\tilde{s}_{j}^{g}&\gets \mathrm{MLP}_{roi}\left( \boldsymbol{D}_{j}^{roi} \right), j=1,\cdots,K, \label{node_layer}
\end{align}
which serve as the replacement of Eq. (\ref{al_1-4}).

The score $\tilde{s}_{i}^{g}$ output by the neural network thereby transitions from a binary score to a continuous soft score ranging from 0 to 1. We introduce a new threshold $\tau_{o2o}$ within the updated criterion of Eq. (\ref{al_1-5}):
\begin{align}
\varOmega_{nms-free}^{pos}=\left\{i|\tilde{s}_{i}^{g}>\tau_{o2o} \right\} \cap \left\{ i|s_{i}^{g}>\tau_{o2m} \right\}.
\end{align}
This criterion is also referred to as the \textit{dual confidence selection} in the main text.
\label{NMS_appendix}
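
For intuition, a minimal PyTorch-style sketch of Eqs. (\ref{edge_layer_1})--(\ref{node_layer}) is given below; the layer widths, activation placement, and handling of empty columns are illustrative assumptions rather than a description of our exact implementation.
\begin{verbatim}
import torch
import torch.nn as nn

class O2OClsHead(nn.Module):
    """Sketch of the O2O classification subhead (edge + node layers)."""
    def __init__(self, c_roi=64, d_n=5):
        super().__init__()
        self.lin_roi = nn.Linear(c_roi, c_roi)            # W_roi, b_roi
        self.w_in = nn.Linear(c_roi, c_roi, bias=False)   # W_in
        self.w_out = nn.Linear(c_roi, c_roi, bias=False)  # W_out
        self.lin_s = nn.Linear(2, c_roi)                  # W_s, b_s
        self.mlp_edge = nn.Sequential(nn.ReLU(), nn.Linear(c_roi, d_n))
        self.mlp_roi = nn.Sequential(nn.ReLU(), nn.Linear(d_n, 1),
                                     nn.Sigmoid())

    def forward(self, F_roi, x_anchor, A):
        # F_roi: (K, C) RoI features; x_anchor: (K, 2) anchor parameters;
        # A: (K, K) boolean adjacency matrix from the two priors.
        F_hat = torch.relu(self.lin_roi(F_roi))
        F_edge = self.w_in(F_hat)[None, :, :] - self.w_out(F_hat)[:, None, :]
        D_edge = self.mlp_edge(
            F_edge + self.lin_s(x_anchor[None] - x_anchor[:, None]))
        # Column-wise max pooling restricted to rows with A = 1; columns
        # without any admissible neighbor are zero-filled.
        D_edge = D_edge.masked_fill(~A.unsqueeze(-1), float("-inf"))
        D_roi = D_edge.max(dim=0).values.nan_to_num(neginf=0.0)  # (K, d_n)
        return self.mlp_roi(D_roi).squeeze(-1)  # soft scores in (0, 1)
\end{verbatim}
At inference time, the dual confidence selection then simply keeps the indices whose soft score exceeds $\tau_{o2o}$ and whose O2M confidence exceeds $\tau_{o2m}$.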

\begin{table*}[t]
\centering
\caption{Dataset information and hyperparameters for the five datasets. For the CULane dataset, $*$ denotes the actual number of training samples used to train the model. Labels for some validation/test sets are missing, so different splits (\textit{i.e.}, validation or test set) are selected for different datasets.}
\begin{adjustbox}{width=\linewidth}
\begin{tabular}{l|l|ccccc}
\toprule
\multicolumn{2}{c|}{\textbf{Dataset}} & CULane & TUSimple & LLAMAS & DL-Rail & CurveLanes \\
\midrule
\multirow{7}*{Dataset Description}
& Train &88,880/$55,698^{*}$&3,268 &58,269&5,435&100,000\\
& Validation &9,675 &358 &20,844&- &20,000 \\
& Test &34,680&2,782 &20,929&1,569&- \\
& Resolution &$1640\times590$&$1280\times720$&$1276\times717$&$1920\times1080$&$2560\times1440$, etc.\\
& Lane &$\leqslant4$&$\leqslant5$&$\leqslant4$&$=2$&$\leqslant10$\\
& Environment &urban and highway & highway&highway&railway&urban and highway\\
& Distribution &sparse&sparse&sparse&sparse&sparse and dense\\
\midrule
\multirow{2}*{Dataset Split}
& Evaluation &Test&Test&Test&Test&Val\\
& Visualization &Test&Test&Val&Test&Val\\
\midrule
\multirow{1}*{Data Preprocessing}
& Crop Height &270&160&300&560&640, etc.\\
\midrule
\multirow{5}*{Training Hyperparameters}
& Epoch Number &32&70&20&90&32\\
& Batch Size &40&24&32&40&40\\
& Warm-up Iterations &800&200&800&400&800\\
& $w_{aux}$ &0.2&0 &0.2&0.2&0.2\\
& $w_{rank}$ &0.7&0.7&0.1&0.7&0 \\
\midrule
\multirow{5}*{Evaluation Hyperparameters}
& $H^{l}\times W^{l}$ &$4\times10$&$4\times10$&$4\times10$&$4\times10$&$6\times13$\\
& $K$ &20&20&20&12&50\\
& $d_n$ &5&8&10&5&5\\
& $\tau_{o2m}$ &0.48&0.40&0.40&0.40&0.45\\
& $\tau_{o2o}$ &0.46&0.46&0.46&0.46&0.44\\
\bottomrule
\end{tabular}
\end{adjustbox}
\label{dataset_info}
\end{table*}

\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{thesis_figure/GLaneIoU.png}
\caption{Illustration of the GLaneIoU redefined in our work.}
\label{glaneiou}
\end{figure}

\section{Details of Intersection over Union between Lane Instances}
To ensure that the IoU between lane instances aligns with the conventions of general object detection methods \cite{iouloss}\cite{giouloss}, we redefine the IoU of lane pairs. As depicted in Fig. \ref{glaneiou}, the newly defined IoU for lane pairs, which we refer to as GLaneIoU, is elaborated as follows:
\begin{align}
\Delta x_{i,p}^{d}&=x_{i+1,p}^{d}-x_{i-1,p}^{d},\,\, \Delta y_{i,p}^{d}=y_{i+1,p}^{d}-y_{i-1,p}^{d}, \\
w_{i,p}&=\frac{\sqrt{\left( \Delta x_{i,p}^{d} \right) ^2+\left( \Delta y_{i,p}^{d} \right) ^2}}{\Delta y_{i,p}^{d}}w^b,\\
b_{i,p}^{l}&=x_{i,p}^{d}-w_{i,p},\,\, b_{i,p}^{r}=x_{i,p}^{d}+w_{i,p},
\end{align}
where $w^{b}$ is the base semi-width parameter and $w_{i,p}$ is the actual semi-width of the $p$-th lane instance. The sets $\left\{ b_{i,p}^{l} \right\} _{i=1}^{N}$ and $\left\{ b_{i,p}^{r} \right\} _{i=1}^{N}$ signify the left and right boundaries of the $p$-th lane instance, respectively. Subsequently, we define the intersection and union distances between lane instances:
\begin{align}
d_{i,pq}^{\mathcal{O}}&=\max \left( \min \left( b_{i,p}^{r}, b_{i,q}^{r} \right) -\max \left( b_{i,p}^{l}, b_{i,q}^{l} \right) , 0 \right),\\
d_{i,pq}^{\xi}&=\max \left( \max \left( b_{i,p}^{l}, b_{i,q}^{l} \right) -\min \left( b_{i,p}^{r}, b_{i,q}^{r} \right) , 0 \right),\\
d_{i,pq}^{\mathcal{U}}&=\max \left( b_{i,p}^{r}, b_{i,q}^{r} \right) -\min \left( b_{i,p}^{l}, b_{i,q}^{l} \right).
\end{align}
The quantities $\left\{d_{i,pq}^{\mathcal{O}}\right\}_{i=1}^{N}$, $\left\{d_{i,pq}^{\xi}\right\}_{i=1}^{N}$, and $\left\{d_{i,pq}^{\mathcal{U}}\right\}_{i=1}^{N}$ denote the overlap distance, gap distance, and union distance, respectively. These definitions closely resemble, but slightly differ from, those in \cite{clrnet} and \cite{adnet}, with modifications to ensure non-negative values. This formulation aims to maintain consistency with the IoU definitions used for bounding boxes. Thus, the overall GLaneIoU between the $p$-th and $q$-th lane instances is expressed as:
\begin{align}
GIoU\left( p,q \right)=\frac{\sum\nolimits_{i=j}^k{d_{i,pq}^{\mathcal{O}}}}{\sum\nolimits_{i=j}^k{d_{i,pq}^{\mathcal{U}}}}-g\frac{\sum\nolimits_{i=j}^k{d_{i,pq}^{\xi}}}{\sum\nolimits_{i=j}^k{d_{i,pq}^{\mathcal{U}}}},
\end{align}
where $j$ and $k$ are the indices of the start and end points, respectively. It is evident that when $g=0$, the GLaneIoU for lane pairs corresponds to the IoU for bounding boxes, with a value range of $\left[0, 1 \right]$; when $g=1$, it corresponds to the GIoU for bounding boxes, with a value range of $\left(-1, 1 \right]$.
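As a worked example, the following NumPy sketch evaluates GLaneIoU for two lanes sampled at the same $N$ heights with uniform spacing; the boundary handling at the first and last points is an assumption for illustration.
\begin{verbatim}
import numpy as np

def glane_iou(x_p, x_q, dy, w_b=2.5, g=0.0):
    """GLaneIoU between two lanes sampled at N common heights.

    x_p, x_q : (N,) x-coordinates of the two lanes at each height
    dy       : vertical spacing between consecutive sampled points
    w_b      : base semi-width; g=0 yields the IoU-style value in
               [0, 1], g=1 the GIoU-style value in (-1, 1].
    """
    def bounds(x):
        dx = np.gradient(x) * 2  # central difference ~ x_{i+1} - x_{i-1}
        w = np.sqrt(dx**2 + (2 * dy)**2) / (2 * dy) * w_b  # semi-width
        return x - w, x + w      # left/right boundaries

    lp, rp = bounds(x_p)
    lq, rq = bounds(x_q)
    d_ov = np.maximum(np.minimum(rp, rq) - np.maximum(lp, lq), 0)  # overlap
    d_gap = np.maximum(np.maximum(lp, lq) - np.minimum(rp, rq), 0)  # gap
    d_un = np.maximum(rp, rq) - np.minimum(lp, lq)                  # union
    return d_ov.sum() / d_un.sum() - g * d_gap.sum() / d_un.sum()
\end{verbatim}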

\label{giou_appendix}

\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{thesis_figure/detection_head_assign.png}
\caption{Label assignment and loss function for the triplet head.}
\label{head_assign}
\end{figure}
\begin{figure*}[t]
\centering
\def\pagewidth{0.49\textwidth}
\def\subwidth{0.47\linewidth}
\def\imgwidth{\linewidth}
\def\imgheight{0.5625\linewidth}
\def\dashheight{0.8\linewidth}

\begin{subfigure}{\pagewidth}
\rotatebox{90}{\small{GT}}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/culane/1_gt.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/culane/2_gt.jpg}
\end{minipage}
\end{subfigure}
\begin{subfigure}{\pagewidth}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/tusimple/1_gt.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/tusimple/2_gt.jpg}
\end{minipage}
\end{subfigure}
\vspace{0.5em}

\begin{subfigure}{\pagewidth}
\raisebox{-1.5em}{\rotatebox{90}{\small{Anchors}}}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/culane/1_anchor.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/culane/2_anchor.jpg}
\end{minipage}
\end{subfigure}
\begin{subfigure}{\pagewidth}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/tusimple/1_anchor.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/tusimple/2_anchor.jpg}
\end{minipage}
\end{subfigure}
\vspace{0.5em}

\begin{subfigure}{\pagewidth}
\raisebox{-2em}{\rotatebox{90}{\small{Predictions}}}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/culane/1_pred.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/culane/2_pred.jpg}
\end{minipage}
\caption{CULane}
\end{subfigure}
\begin{subfigure}{\pagewidth}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/tusimple/1_pred.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/tusimple/2_pred.jpg}
\end{minipage}
\caption{TuSimple}
\end{subfigure}
\vspace{0.5em}

% \begin{tikzpicture}
% \draw[dashed, pattern=on 8pt off 2pt, color=gray, line width=1pt] (-\textwidth/2,0) -- (\textwidth/2.,0);
% \end{tikzpicture}
% \vspace{0.05em}

\begin{subfigure}{\pagewidth}
\rotatebox{90}{\small{GT}}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/llamas/1_gt.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/llamas/2_gt.jpg}
\end{minipage}
\end{subfigure}
\begin{subfigure}{\pagewidth}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/dlrail/1_gt.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/dlrail/2_gt.jpg}
\end{minipage}
\end{subfigure}
\vspace{0.5em}

\begin{subfigure}{\pagewidth}
\raisebox{-1.5em}{\rotatebox{90}{\small{Anchors}}}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/llamas/1_anchor.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/llamas/2_anchor.jpg}
\end{minipage}
\end{subfigure}
\begin{subfigure}{\pagewidth}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/dlrail/1_anchor.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/dlrail/2_anchor.jpg}
\end{minipage}
\end{subfigure}
\vspace{0.5em}

\begin{subfigure}{\pagewidth}
\raisebox{-2em}{\rotatebox{90}{\small{Predictions}}}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/llamas/1_pred.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/llamas/2_pred.jpg}
\end{minipage}
\caption{LLAMAS}
\end{subfigure}
\begin{subfigure}{\pagewidth}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/dlrail/1_pred.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/dlrail/2_pred.jpg}
\end{minipage}
\caption{DL-Rail}
\end{subfigure}
\vspace{0.5em}

\caption{Visualization of detection outcomes in sparse scenarios of four datasets.}
\label{vis_sparse}
\end{figure*}

\begin{figure*}[t]
\centering
\def\subwidth{0.24\textwidth}
\def\imgwidth{\linewidth}
\def\imgheight{0.5625\linewidth}

\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun_gt.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun_pred50.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}

\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun2_gt.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun2_pred50.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun2_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun2_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}

\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less_gt.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less_pred50.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}

\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less2_gt.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less2_pred50.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less2_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less2_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}

\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all_gt.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all_pred50.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}

\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all2_gt.jpg}
\caption{GT}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all2_pred50.jpg}
\caption{NMS@50}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all2_pred15.jpg}
\caption{NMS@15}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all2_NMSfree.jpg}
\caption{NMSFree}
\end{subfigure}
\vspace{0.5em}

\caption{Visualization of the detection outcomes in sparse and dense scenarios on the CurveLanes dataset.}
\label{vis_dense}
\end{figure*}

\section{Details about the Label Assignment and Loss Functions}
Details about the cost functions and label assignments for the triplet head are furnished here. A dual label assignment strategy \cite{date} is employed for the triplet head, as illustrated in Fig. \ref{head_assign}. Specifically, we implement one-to-many label assignment for both the O2M classification subhead and the O2M regression subhead; this part closely aligns with previous work \cite{clrernet}. To endow our model with the NMS-free paradigm, we additionally incorporate the O2O classification subhead and apply a one-to-one label assignment to it.

The cost metrics for both the one-to-one and one-to-many label assignments are articulated as follows:
\begin{align}
\mathcal{C} _{p,q}^{o2o}&=\tilde{s}_{p}^{g}\times \left( GIoU\left( p,q \right) \right) ^{\beta}, \label{o2o_cost}\\
\mathcal{C} _{p,q}^{o2m}&=s_{p}^{g}\times \left( GIoU\left( p,q \right) \right) ^{\beta}, \label{o2m_cost}
\end{align}
where $\mathcal{C} _{p,q}^{o2o}$ and $\mathcal{C} _{p,q}^{o2m}$ denote the cost between the $p$-th prediction and the $q$-th ground truth, and $g$ in $GIoU$ is set to $0$ to keep the metric non-negative. These metrics imply that both the confidence score and the geometric distance contribute to the assignment cost.

Suppose that there exist $K$ predictions and $G$ ground truths. Let $\pi$ denote a one-to-one assignment strategy, where $\pi(q)$ indicates that the $\pi(q)$-th prediction is assigned to the $q$-th ground truth. Additionally, $\mathscr{S}_{K, G}$ denotes the set of all possible one-to-one assignment strategies for $K$ predictions and $G$ ground truths. It is straightforward to show that the total number of one-to-one assignment strategies $\left| \mathscr{S} _{K,G} \right|$ is $\frac{K!}{\left( K-G \right)!}$. The final optimal assignment $\hat{\pi}$ is determined as follows:
\begin{align}
\hat{\pi}=\underset{\pi \in \mathscr{S}_{K,G}}{\mathrm{arg\,max}}\sum_{q=1}^G{\mathcal{C} _{\pi \left( q \right) ,q}^{o2o}}.
\end{align}
This assignment problem can be solved by the Hungarian algorithm \cite{detr}. Finally, $G$ predictions are assigned as positive samples and the remaining $K-G$ predictions as negative samples.
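As a concrete illustration, the optimal assignment can be obtained with an off-the-shelf Hungarian solver; the sketch below uses \texttt{scipy.optimize.linear\_sum\_assignment} on an assumed precomputed cost matrix.
\begin{verbatim}
import numpy as np
from scipy.optimize import linear_sum_assignment

def one_to_one_assign(cost_o2o):
    """cost_o2o: (K, G) matrix of C^{o2o}_{p,q} for each pair.
    Returns the prediction index pi(q) for each ground truth q,
    maximizing the total assignment cost."""
    # linear_sum_assignment minimizes, so negate to maximize.
    pred_idx, gt_idx = linear_sum_assignment(-cost_o2o)
    pi = np.empty(cost_o2o.shape[1], dtype=int)
    pi[gt_idx] = pred_idx
    return pi  # positives: pi; negatives: all other predictions
\end{verbatim}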

For the one-to-many label assignment, we simply use SimOTA \cite{yolox}, in line with previous works \cite{clrernet}. Omitting the detailed process of SimOTA, we only introduce its inputs, namely the cost matrix $\boldsymbol{M}^C\in \mathbb{R}^{G\times K}$ and the IoU matrix $\boldsymbol{M}^{IoU}\in \mathbb{R}^{G\times K}$, whose elements are defined as $M^C_{qp}=\mathcal{C} _{p,q}^{o2m}$ and $M^{IoU}_{qp}= GIoU\left( p,q \right)$ (with $g=0$), respectively. The number of predictions assigned to each ground truth is variable but does not exceed an upper bound $k_{dynamic}$, which is set to $4$ in our experiments. Finally, there are $K_{pos}$ positive samples and $K-K_{pos}$ negative samples, where $K_{pos}$ ranges from $0$ to $Gk_{dynamic}$.
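For reference, the following is a simplified sketch of a SimOTA-style dynamic-$k$ selection under the stated inputs; it omits the conflict-resolution step of the full algorithm, and the way $k$ is estimated from the IoU mass is an assumption for illustration.
\begin{verbatim}
import numpy as np

def simota_assign(M_cost, M_iou, k_dynamic=4):
    """Simplified SimOTA-style one-to-many assignment.

    M_cost, M_iou : (G, K) cost and IoU matrices; the cost here is a
                    score to maximize, matching the o2m cost metric.
    Returns a (G, K) boolean matrix marking positive samples.
    """
    G, K = M_cost.shape
    pos = np.zeros((G, K), dtype=bool)
    for q in range(G):
        # Dynamic k: estimated from the IoU mass of the candidates,
        # capped by the upper bound k_dynamic.
        k = int(np.clip(M_iou[q].sum(), 1, k_dynamic))
        pos[q, np.argsort(-M_cost[q])[:k]] = True
    return pos
\end{verbatim}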

Given the ground-truth label generated by the label assignment strategy for each prediction, we can construct the loss functions for the training phase. As illustrated in Fig. \ref{head_assign}, $\mathcal{L}_{cls}^{o2o}$ and $\mathcal{L}_{rank}$ are for the O2O classification subhead, whereas $\mathcal{L}_{cls}^{o2m}$ is for the O2M classification subhead. Meanwhile, $\mathcal{L}_{GIoU}$ (with $g=1$), $\mathcal{L}_{end}$, and $\mathcal{L}_{aux}$ are designated for the O2M regression subhead. The gradient from the O2O classification subhead to the RoI pooling layer is stopped to preserve the quality of feature learning. $\left( \hat{\theta}_{i,\cdot}^{seg},\hat{r}_{i,\cdot}^{seg} \right)$ is ignored during evaluation.
\label{assign_appendix}

\section{Supplementary Implementation Details and Visualization Results}
Important implementation details for each dataset are shown in Table \ref{dataset_info}, including the dataset information employed to conduct experiments and visualizations, the data-preprocessing parameters, and the hyperparameters of Polar R-CNN.

Fig. \ref{vis_sparse} illustrates the visualization outcomes in sparse scenarios spanning four datasets. The top row depicts the ground truth, the middle row shows the proposed lane anchors, and the bottom row exhibits the predictions generated by Polar R-CNN with the NMS-free paradigm. In the top and bottom rows, different colors merely distinguish different lane instances and do not correspond across images. The middle row shows that the LPH of Polar R-CNN effectively proposes anchors clustered around the ground truth, providing a robust prior for the GPH to produce the final lane predictions. Moreover, the number of anchors is significantly reduced compared with previous works, making our method theoretically faster than other anchor-based methods.

Fig. \ref{vis_dense} shows the visualization outcomes in dense scenarios. The first column displays the ground truth, while the second and third columns present the detection results under the NMS paradigm with a large threshold (\textit{i.e.}, the default NMS@50 with 50 pixels) and a small threshold (\textit{i.e.}, the optimal NMS@15 with 15 pixels), respectively. The final column shows the detection results under the NMS-free paradigm. We observe that NMS@50 mistakenly removes some true predictions, leading to false negatives, whereas NMS@15 fails to eliminate some redundant predictions, leading to false positives. This underscores the difficult trade-off between large and small NMS thresholds and demonstrates that geometric distance becomes less reliable in dense scenarios. Only the proposed data-driven O2O classification subhead addresses this issue, by capturing semantic distance beyond geometric distance. As shown in the last column of Fig. \ref{vis_dense}, the O2O classification subhead successfully eliminates redundant predictions while preserving dense true predictions, despite their minimal geometric distances.
\label{vis_appendix}

\bibliographystyle{IEEEtran}
\bibliography{reference}
%\newpage
\end{appendices}
\end{document}