This commit is contained in:
haoru 2024-11-03 17:23:11 +08:00
parent 163eb2fa72
commit 3cda3f0e62
6 changed files with 981 additions and 38 deletions

main.bbl (Normal file, 310 lines added)

@@ -0,0 +1,310 @@
% Generated by IEEEtran.bst, version: 1.14 (2015/08/26)
\begin{thebibliography}{10}
\providecommand{\url}[1]{#1}
\csname url@samestyle\endcsname
\providecommand{\newblock}{\relax}
\providecommand{\bibinfo}[2]{#2}
\providecommand{\BIBentrySTDinterwordspacing}{\spaceskip=0pt\relax}
\providecommand{\BIBentryALTinterwordstretchfactor}{4}
\providecommand{\BIBentryALTinterwordspacing}{\spaceskip=\fontdimen2\font plus
\BIBentryALTinterwordstretchfactor\fontdimen3\font minus
\fontdimen4\font\relax}
\providecommand{\BIBforeignlanguage}[2]{{%
\expandafter\ifx\csname l@#1\endcsname\relax
\typeout{** WARNING: IEEEtran.bst: No hyphenation pattern has been}%
\typeout{** loaded for the language `#1'. Using the pattern for}%
\typeout{** the default language instead.}%
\else
\language=\csname l@#1\endcsname
\fi
#2}}
\providecommand{\BIBdecl}{\relax}
\BIBdecl
\bibitem{adas}
A.~Bar~Hillel, R.~Lerner, D.~Levi, and G.~Raz, ``Recent progress in road and
lane detection: a survey,'' \emph{Machine vision and applications}, vol.~25,
no.~3, pp. 727--745, 2014.
\bibitem{scnn}
X.~Pan, J.~Shi, P.~Luo, X.~Wang, and X.~Tang, ``Spatial as deep: Spatial cnn
for traffic scene understanding,'' in \emph{Proceedings of the AAAI
conference on artificial intelligence}, vol.~32, no.~1, 2018.
\bibitem{polylanenet}
L.~Tabelini, R.~Berriel, T.~M. Paixao, C.~Badue, A.~F. De~Souza, and
T.~Oliveira-Santos, ``Polylanenet: Lane estimation via deep polynomial
regression,'' in \emph{2020 25th International Conference on Pattern
Recognition (ICPR)}.\hskip 1em plus 0.5em minus 0.4em\relax IEEE, 2021, pp.
6150--6156.
\bibitem{cannyedge}
J.~Canny, ``A computational approach to edge detection,'' \emph{IEEE
Transactions on pattern analysis and machine intelligence}, no.~6, pp.
679--698, 1986.
\bibitem{houghtransform}
J.~Illingworth and J.~Kittler, ``A survey of the hough transform,''
\emph{Computer vision, graphics, and image processing}, vol.~44, no.~1, pp.
87--116, 1988.
\bibitem{kluge1995deformable}
K.~Kluge and S.~Lakshmanan, ``A deformable-template approach to lane
detection,'' in \emph{Proceedings of the Intelligent Vehicles' 95.
Symposium}.\hskip 1em plus 0.5em minus 0.4em\relax IEEE, 1995, pp. 54--59.
\bibitem{lstr}
R.~Liu, Z.~Yuan, T.~Liu, and Z.~Xiong, ``End-to-end lane shape prediction with
transformers,'' in \emph{Proceedings of the IEEE/CVF winter conference on
applications of computer vision}, 2021, pp. 3694--3702.
\bibitem{lanenet}
Z.~Wang, W.~Ren, and Q.~Qiu, ``Lanenet: Real-time lane detection networks for
autonomous driving,'' \emph{arXiv preprint arXiv:1807.01726}, 2018.
\bibitem{bezierlanenet}
Z.~Feng, S.~Guo, X.~Tan, K.~Xu, M.~Wang, and L.~Ma, ``Rethinking efficient lane
detection via curve modeling,'' in \emph{Proceedings of the IEEE/CVF
Conference on Computer Vision and Pattern Recognition}, 2022, pp.
17\,062--17\,070.
\bibitem{yolov10}
A.~Wang, H.~Chen, L.~Liu, K.~Chen, Z.~Lin, J.~Han, and G.~Ding, ``Yolov10:
Real-time end-to-end object detection,'' \emph{arXiv preprint
arXiv:2405.14458}, 2024.
\bibitem{fasterrcnn}
S.~Ren, K.~He, R.~Girshick, and J.~Sun, ``Faster r-cnn: Towards real-time
object detection with region proposal networks,'' \emph{IEEE transactions on
pattern analysis and machine intelligence}, vol.~39, no.~6, pp. 1137--1149,
2016.
\bibitem{laneatt}
L.~Tabelini, R.~Berriel, T.~M. Paixao, C.~Badue, A.~F. De~Souza, and
T.~Oliveira-Santos, ``Keep your eyes on the lane: Real-time attention-guided
lane detection,'' in \emph{Proceedings of the IEEE/CVF conference on computer
vision and pattern recognition}, 2021, pp. 294--302.
\bibitem{clrnet}
T.~Zheng, Y.~Huang, Y.~Liu, W.~Tang, Z.~Yang, D.~Cai, and X.~He, ``Clrnet:
Cross layer refinement network for lane detection,'' in \emph{Proceedings of
the IEEE/CVF conference on computer vision and pattern recognition}, 2022,
pp. 898--907.
\bibitem{nms}
A.~Neubeck and L.~Van~Gool, ``Efficient non-maximum suppression,'' in
\emph{18th international conference on pattern recognition (ICPR'06)},
vol.~3.\hskip 1em plus 0.5em minus 0.4em\relax IEEE, 2006, pp. 850--855.
\bibitem{adnet}
L.~Xiao, X.~Li, S.~Yang, and W.~Yang, ``Adnet: Lane shape prediction via anchor
decomposition,'' in \emph{Proceedings of the IEEE/CVF International
Conference on Computer Vision}, 2023, pp. 6404--6413.
\bibitem{srlane}
C.~Chen, J.~Liu, C.~Zhou, J.~Tang, and G.~Wu, ``Sketch and refine: Towards fast
and accurate lane detection,'' in \emph{Proceedings of the AAAI Conference on
Artificial Intelligence}, vol.~38, no.~2, 2024, pp. 1001--1009.
\bibitem{clrernet}
H.~Honda and Y.~Uchida, ``Clrernet: improving confidence of lane detection with
laneiou,'' in \emph{Proceedings of the IEEE/CVF Winter Conference on
Applications of Computer Vision}, 2024, pp. 1176--1185.
\bibitem{gnn}
Z.~Wu, S.~Pan, F.~Chen, G.~Long, C.~Zhang, and P.~S. Yu, ``A comprehensive
survey on graph neural networks,'' \emph{IEEE transactions on neural networks
and learning systems}, vol.~32, no.~1, pp. 4--24, 2020.
\bibitem{tusimple}
\BIBentryALTinterwordspacing
{TuSimple}, ``Tusimple benchmark,'' 2020, accessed: September 2020. [Online].
Available: \url{https://github.com/TuSimple/tusimple-benchmark/}
\BIBentrySTDinterwordspacing
\bibitem{llamas}
K.~Behrendt and R.~Soussan, ``Unsupervised labeled lane markers using maps,''
in \emph{Proceedings of the IEEE/CVF international conference on computer
vision workshops}, 2019, pp. 0--0.
\bibitem{curvelanes}
H.~Xu, S.~Wang, X.~Cai, W.~Zhang, X.~Liang, and Z.~Li, ``Curvelane-nas:
Unifying lane-sensitive architecture search and adaptive point blending,'' in
\emph{Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK,
August 23--28, 2020, Proceedings, Part XV 16}.\hskip 1em plus 0.5em minus
0.4em\relax Springer, 2020, pp. 689--704.
\bibitem{dalnet}
Z.~Yu, Q.~Liu, W.~Wang, L.~Zhang, and X.~Zhao, ``Dalnet: A rail detection
network based on dynamic anchor line,'' \emph{IEEE Transactions on
Instrumentation and Measurement}, 2024.
\bibitem{ufld}
Z.~Qin, H.~Wang, and X.~Li, ``Ultra fast structure-aware deep lane detection,''
in \emph{Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK,
August 23--28, 2020, Proceedings, Part XXIV 16}.\hskip 1em plus 0.5em minus
0.4em\relax Springer, 2020, pp. 276--291.
\bibitem{ufldv2}
Z.~Qin, P.~Zhang, and X.~Li, ``Ultra fast deep lane detection with hybrid
anchor driven ordinal classification,'' \emph{IEEE transactions on pattern
analysis and machine intelligence}, vol.~46, no.~5, pp. 2555--2568, 2022.
\bibitem{CondLaneNet}
L.~Liu, X.~Chen, S.~Zhu, and P.~Tan, ``Condlanenet: a top-to-down lane
detection framework based on conditional convolution,'' in \emph{Proceedings
of the IEEE/CVF international conference on computer vision}, 2021, pp.
3773--3782.
\bibitem{fololane}
Z.~Qin, P.~Zhang, and X.~Li, ``Ultra fast deep lane detection with hybrid
anchor driven ordinal classification,'' \emph{IEEE transactions on pattern
analysis and machine intelligence}, vol.~46, no.~5, pp. 2555--2568, 2022.
\bibitem{ganet}
M.~Morley, R.~Atkinson, D.~Savi{\'c}, and G.~Walters, ``Ganet: genetic
algorithm platform for pipe network optimisation,'' \emph{Advances in
engineering software}, vol.~32, no.~6, pp. 467--475, 2001.
\bibitem{bsnet}
H.~Chen, M.~Wang, and Y.~Liu, ``Bsnet: Lane detection via draw b-spline curves
nearby,'' \emph{arXiv preprint arXiv:2301.06910}, 2023.
\bibitem{yolox}
Z.~Ge, S.~Liu, F.~Wang, Z.~Li, and J.~Sun, ``Yolox: Exceeding yolo series in
2021,'' \emph{arXiv preprint arXiv:2107.08430}, 2021.
\bibitem{sparse}
J.~Liu, Z.~Zhang, M.~Lu, H.~Wei, D.~Li, Y.~Xie, J.~Peng, L.~Tian, A.~Sirasao,
and E.~Barsoum, ``Sparse laneformer,'' \emph{arXiv preprint
arXiv:2404.07821}, 2024.
\bibitem{detr}
N.~Carion, F.~Massa, G.~Synnaeve, N.~Usunier, A.~Kirillov, and S.~Zagoruyko,
``End-to-end object detection with transformers,'' in \emph{European
conference on computer vision}.\hskip 1em plus 0.5em minus 0.4em\relax
Springer, 2020, pp. 213--229.
\bibitem{o2o}
P.~Sun, Y.~Jiang, E.~Xie, W.~Shao, Z.~Yuan, C.~Wang, and P.~Luo, ``What makes
for end-to-end object detection?'' in \emph{International Conference on
Machine Learning}.\hskip 1em plus 0.5em minus 0.4em\relax PMLR, 2021, pp.
9934--9944.
\bibitem{learnNMS}
J.~Hosang, R.~Benenson, and B.~Schiele, ``Learning non-maximum suppression,''
in \emph{Proceedings of the IEEE conference on computer vision and pattern
recognition}, 2017, pp. 4507--4515.
\bibitem{date}
Y.~Chen, Q.~Chen, Q.~Hu, and J.~Cheng, ``Date: Dual assignment for end-to-end
fully convolutional object detection,'' \emph{arXiv preprint
arXiv:2211.13859}, 2022.
\bibitem{o3d}
J.~Wang, L.~Song, Z.~Li, H.~Sun, J.~Sun, and N.~Zheng, ``End-to-end object
detection with fully convolutional network,'' in \emph{Proceedings of the
IEEE/CVF conference on computer vision and pattern recognition}, 2021, pp.
15\,849--15\,858.
\bibitem{relationnet}
H.~Hu, J.~Gu, Z.~Zhang, J.~Dai, and Y.~Wei, ``Relation networks for object
detection,'' in \emph{Proceedings of the IEEE conference on computer vision
and pattern recognition}, 2018, pp. 3588--3597.
\bibitem{resnet}
K.~He, X.~Zhang, S.~Ren, and J.~Sun, ``Deep residual learning for image
recognition,'' in \emph{Proceedings of the IEEE conference on computer vision
and pattern recognition}, 2016, pp. 770--778.
\bibitem{fpn}
T.-Y. Lin, P.~Doll{\'a}r, R.~Girshick, K.~He, B.~Hariharan, and S.~Belongie,
``Feature pyramid networks for object detection,'' in \emph{Proceedings of
the IEEE conference on computer vision and pattern recognition}, 2017, pp.
2117--2125.
\bibitem{linecnn}
X.~Li, J.~Li, X.~Hu, and J.~Yang, ``Line-cnn: End-to-end traffic line detection
with line proposal unit,'' \emph{IEEE Transactions on Intelligent
Transportation Systems}, vol.~21, no.~1, pp. 248--258, 2019.
\bibitem{vanishing}
Y.~Su, Y.~Zhang, T.~Lu, J.~Yang, and H.~Kong, ``Vanishing point constrained
lane detection with a stereo camera,'' \emph{IEEE Transactions on Intelligent
Transportation Systems}, vol.~19, no.~8, pp. 2739--2744, 2017.
\bibitem{dualassign}
S.~Li, C.~He, R.~Li, and L.~Zhang, ``A dual weighting label assignment scheme
for object detection,'' in \emph{Proceedings of the IEEE/CVF conference on
computer vision and pattern recognition}, 2022, pp. 9387--9396.
\bibitem{yolact}
D.~Bolya, C.~Zhou, F.~Xiao, and Y.~J. Lee, ``Yolact: Real-time instance
segmentation,'' in \emph{Proceedings of the IEEE/CVF international conference
on computer vision}, 2019, pp. 9157--9166.
\bibitem{pointnet}
C.~R. Qi, H.~Su, K.~Mo, and L.~J. Guibas, ``Pointnet: Deep learning on point
sets for 3d classification and segmentation,'' in \emph{Proceedings of the
IEEE conference on computer vision and pattern recognition}, 2017, pp.
652--660.
\bibitem{focal}
T.-Y. Lin, P.~Goyal, R.~Girshick, K.~He, and P.~Doll{\'a}r, ``Focal loss for
dense object detection,'' in \emph{Proceedings of the IEEE international
conference on computer vision}, 2017, pp. 2980--2988.
\bibitem{pss}
Q.~Zhou and C.~Yu, ``Object detection made simpler by eliminating heuristic
nms,'' \emph{IEEE Transactions on Multimedia}, vol.~25, pp. 9254--9262, 2023.
\bibitem{adam}
D.~P. Kingma and J.~Ba, ``Adam: A method for stochastic optimization,'' \emph{arXiv
preprint arXiv:1412.6980}, 2014.
\bibitem{dla}
F.~Yu, D.~Wang, E.~Shelhamer, and T.~Darrell, ``Deep layer aggregation,'' in
\emph{Proceedings of the IEEE conference on computer vision and pattern
recognition}, 2018, pp. 2403--2412.
\bibitem{resa}
T.~Zheng, H.~Fang, Y.~Zhang, W.~Tang, Z.~Yang, H.~Liu, and D.~Cai, ``Resa:
Recurrent feature-shift aggregator for lane detection,'' in \emph{Proceedings
of the AAAI conference on artificial intelligence}, vol.~35, no.~4, 2021, pp.
3547--3554.
\bibitem{laneaf}
H.~Abualsaud, S.~Liu, D.~B. Lu, K.~Situ, A.~Rangesh, and M.~M. Trivedi,
``Laneaf: Robust multi-lane detection with affinity fields,'' \emph{IEEE
Robotics and Automation Letters}, vol.~6, no.~4, pp. 7477--7484, 2021.
\bibitem{eigenlanes}
D.~Jin, W.~Park, S.-G. Jeong, H.~Kwon, and C.-S. Kim, ``Eigenlanes: Data-driven
lane descriptors for structurally diverse lanes,'' in \emph{Proceedings of
the IEEE/CVF conference on computer vision and pattern recognition}, 2022,
pp. 17\,163--17\,171.
\bibitem{enetsad}
Y.~Hou, Z.~Ma, C.~Liu, and C.~C. Loy, ``Learning lightweight lane detection
cnns by self attention distillation,'' in \emph{Proceedings of the IEEE/CVF
international conference on computer vision}, 2019, pp. 1013--1021.
\bibitem{pointlanenet}
Z.~Chen, Q.~Liu, and C.~Lian, ``Pointlanenet: Efficient end-to-end cnns for
accurate real-time lane detection,'' in \emph{2019 IEEE intelligent vehicles
symposium (IV)}.\hskip 1em plus 0.5em minus 0.4em\relax IEEE, 2019, pp.
2563--2568.
\bibitem{iouloss}
J.~Yu, Y.~Jiang, Z.~Wang, Z.~Cao, and T.~Huang, ``Unitbox: An advanced object
detection network,'' in \emph{Proceedings of the 24th ACM international
conference on Multimedia}, 2016, pp. 516--520.
\bibitem{giouloss}
H.~Rezatofighi, N.~Tsoi, J.~Gwak, A.~Sadeghian, I.~Reid, and S.~Savarese,
``Generalized intersection over union: A metric and a loss for bounding box
regression,'' in \emph{Proceedings of the IEEE/CVF conference on computer
vision and pattern recognition}, 2019, pp. 658--666.
\end{thebibliography}


@@ -265,7 +265,7 @@ where $\boldsymbol{c}^{g} \in \mathbb{R}^{2}$ and $\boldsymbol{c}^{l}_{j} \in \m
x_{i,j}^{s}&=-y_{i,j}^{s}\tan \theta _j+\frac{r_{j}^{g}+\left[ \cos \theta _j;\sin \theta _j \right] ^T\boldsymbol{c}^g}{\cos \theta _j},\label{positions}\\
i&=1,2,\cdots,N;j=1,2,\cdots,K,\notag
\end{align}
where the y-coordinates $\boldsymbol{y}_{j}\equiv \{y_{1,j},y_{2,j},\cdots ,y_{N,j}\}$ of the $j$-th lane anchor are uniformly sampled vertically from the image, as previously mentioned. The proof of Eqs. (\ref{l2g})-(\ref{positions}) can be found in Appendix \textcolor{red}{A}. Then the coordinates of the $j$-th lane anchor are given by $\boldsymbol{\ell}_j=\{\boldsymbol{x}_{j},\boldsymbol{y}_j\}\equiv \left\{(x_{1,j},y_{1,j}),(x_{2,j},y_{2,j}),\cdots ,(x_{N,j}, y_{N,j})\right\}$.
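As an aside for implementers, the sampling defined by Eq. (\ref{positions}) can be read as the short NumPy sketch below; the function name, the bottom-up y-axis range $[0, H]$, and the argument layout are illustrative assumptions rather than the paper's actual code.

```python
import numpy as np

def sample_anchor_points(theta, r_g, c_g, H, N=36):
    """Sample N points of a lane anchor from its global polar parameters.

    theta : anchor angle (radians); r_g : global radius;
    c_g   : (x, y) of the global pole -- all illustrative names.
    """
    ys = np.linspace(0.0, H, N)                               # uniform vertical sampling
    proj = np.cos(theta) * c_g[0] + np.sin(theta) * c_g[1]    # [cos t; sin t]^T c^g
    xs = -ys * np.tan(theta) + (r_g + proj) / np.cos(theta)   # Eq. (positions)
    return np.stack([xs, ys], axis=1)

# Example: one anchor sampled over an assumed 320-pixel-high image
pts = sample_anchor_points(theta=0.1, r_g=40.0, c_g=(400.0, 0.0), H=320)
```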
\par
Given the different level feature maps $\boldsymbol{P}_1, \boldsymbol{P}_2, \boldsymbol{P}_3$ from FPN, we can extract the channel-wise features of each point corresponding to the positions of $\{(x_{1,j},y_{1,j}),(x_{2,j},y_{2,j}),\cdots,(x_{N,j},y_{N,j})\}_{j=1}^{K}$, respectively denoted as $\boldsymbol{F}_{1,j}, \boldsymbol{F}_{2,j}, \boldsymbol{F}_{3,j}\in \mathbb{R} ^{N\times C_f}$. To enhance representation, similar to \cite{srlane}, we employ a weighted sum strategy to combine features from the three levels by:
\begin{align}
@@ -279,7 +279,7 @@ Here, $\boldsymbol{W}_{pool}\in \mathbb{R} ^{d_r\times NC_f}$ is employed to fur
\par
\textbf{Triplet Head.} The lane detection head classifies and regresses the lane anchors generated from the LPM based on the ROI pooling features $\{\boldsymbol{F}_{i}^{roi}\in \mathbb{R} ^{d_r}\}_{i=1}^K$. A traditional lane detection head \cite{laneatt} is usually equipped with a \textit{One-to-Many} (O2M) classification subhead and a \textit{One-to-Many} (O2M) regression subhead. However, the one-to-many mechanism (\textit{i.e.}, \textit{many candidates for one ground truth}) causes redundant predictions for each lane and therefore requires the NMS post-processing operator. NMS, in turn, is non-differentiable and breaks the end-to-end pipeline, which entails manually tuned hyperparameters and suboptimal performance. To eliminate NMS post-processing while achieving end-to-end learning, we introduce a triplet head module for lane detection.
\par
As shown in Fig. \ref{gpm}, the triplet head consists of three components: the O2M classification, the O2M regression, and another \textit{One-to-One} (O2O) classification. The features of each lane anchor $\{\boldsymbol{F}_{j}^{roi}\}$ are fed into the three subheads, respectively. To keep both simplicity and efficiency, the O2M classification and O2M regression subheads each apply two \textit{multi-layer perceptrons} (MLPs) to $\{\boldsymbol{F}_{j}^{roi}\}$ and then generate the confidence scores $\left\{{s}_j^g\right\}$ (O2M classification subhead) and the x-coordinate offsets $\{\Delta\boldsymbol{x}_j\}$ (O2M regression subhead) for each lane anchor. More details about the O2M classification and O2M regression subheads can be found in \cite{yolox}. The O2O classification subhead is introduced to generate non-redundant lane candidates within an NMS-free paradigm. However, the direct use of a one-to-one strategy (\textit{i.e.}, \textit{assigning one positive anchor for one ground truth lane}) based on the extracted features will damage the model's learning \cite{dualassign}\cite{yolov10}. Instead, the proposed O2O classification subhead considers both the \textit{confidence prior} $\left\{{s}_j^g\right\}$ of the O2M classification subhead and the \textit{spatial geometric prior} of the polar parameters (\textit{i.e.}, the angle $\theta$ and the radius $r$), and applies these priors to adaptively refine the lane anchor features $\{\boldsymbol{F}_{j}^{roi}\}$, which yields the refined lane anchor features $\{\boldsymbol{D}_{j}^{roi}\}$ and the confidence scores $\left\{\tilde{s}_j^g\right\}$.
The structural design draws inspiration from Fast NMS \cite{yolact}, with further particulars accessible in Appendix \textcolor{red}{B}.
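To give a concrete picture of this wiring, the following PyTorch-style sketch lays out one plausible triplet head with two-layer MLPs; all layer names, widths, and the simplified O2O branch are assumptions for illustration, not the authors' released implementation.

```python
import torch
import torch.nn as nn

class TripletHead(nn.Module):
    """Sketch of a triplet head: O2M cls, O2M reg, and O2O cls subheads."""

    def __init__(self, d_r=64, n_reg=72, hidden=64):
        super().__init__()
        def mlp(out_dim):
            return nn.Sequential(nn.Linear(d_r, hidden), nn.ReLU(),
                                 nn.Linear(hidden, out_dim))
        self.o2m_cls = mlp(1)          # confidence s_j^g per anchor
        self.o2m_reg = mlp(n_reg + 2)  # x-coordinate offsets plus two end points
        self.o2o_cls = mlp(1)          # refined confidence \tilde{s}_j^g

    def forward(self, f_roi):          # f_roi: (K, d_r) pooled anchor features
        s_o2m = torch.sigmoid(self.o2m_cls(f_roi)).squeeze(-1)
        reg = self.o2m_reg(f_roi)
        # In the paper the O2O branch also ingests the confidence and polar
        # geometric priors; it is reduced to a plain MLP here for brevity.
        s_o2o = torch.sigmoid(self.o2o_cls(f_roi)).squeeze(-1)
        return s_o2m, reg, s_o2o
```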
\par
More specifically, the O2O classification subhead first calculates the \textit{semantic distance} between the $i$-th anchor with its x-coordinate $\boldsymbol{x}_{i}$ and the $j$-th anchor with its x-coordinate $\boldsymbol{x}_{j}$ as follows:
\begin{align}
@@ -326,9 +326,9 @@ As stated above, the O2O classification subhead is formed from Eqs. (\ref{edge_l
\end{align}
where $\tau_{o2o}$ and $\tau_{o2m}$ are two confidence thresholds. The $\Omega^{pos}$ allows for non-redundant positive predictions without NMS post-processing, since the O2O classification subhead enhances the confidence score variability among similar anchors, making it less sensitive to the two confidence thresholds.
\par
\textbf{Loss function for GPM.} After obtaining the positive candidate set $\Omega^{pos}$ for the O2O classification subhead, the Hungarian algorithm \cite{detr} is applied to perform label assignment, \textit{i.e.}, a one-to-one assignment between the positive anchors and the ground truth instances. As for the O2M classification and O2M regression subheads, we use the same approach as in SimOTA \cite{yolox} for label assignment. More details about label assignment and the cost function can be found in Appendix \textcolor{red}{C} and \textcolor{red}{D}. During training, the focal loss \cite{focal} is applied to both the O2O classification subhead and the O2M classification subhead, denoted as $\mathcal{L}^{o2o}_{cls}$ and $\mathcal{L}^{o2m}_{cls}$, respectively. Furthermore, we adopt the rank loss $\mathcal{L}_{rank}$ \cite{pss} to amplify the disparity between the positive and negative confidences of the O2O classification subhead. Note that, similar to \cite{pss}, we stop the gradient flow from the O2O classification subhead during the training stage to preserve the quality of RoI feature learning.
To train the O2M regression subhead, we redefine the GIoU concept (refer to Appendix \textcolor{red}{C} for more details) and adopt the GIoU loss $\mathcal{L}_{GIoU}^{o2m}$ to regress the x-coordinate offsets $\{\Delta\boldsymbol{x}_j\}$ for each positive lane anchor. The end points of lanes are trained with a $Smooth_{L1}$ loss $\mathcal{L}_{end}^{o2m}$. In addition, we propose an auxiliary loss $\mathcal{L}_{aux}$ to facilitate the learning of global features. As illustrated in Fig. \ref{auxloss}, the anchors and ground truth are divided into several segments, with each anchor segment being regressed to the primary components of the corresponding segment of the ground truth. The auxiliary loss $\mathcal{L}_{aux}$ helps the detection head gain a deeper understanding of the global geometric structure, and the auxiliary regression branch is dropped during the evaluation stage. Finally, the classification loss $\mathcal{L} _{cls}^{g}$ and the regression loss $\mathcal{L} _{reg}^{g}$ for GPM are given as follows:
\begin{align}
\mathcal{L} _{cls}^{g}&=w^{o2m}_{cls}\mathcal{L}^{o2m}_{cls}+w^{o2o}_{cls}\mathcal{L}^{o2o}_{cls}+w_{rank}\mathcal{L}_{rank},
\\
@@ -373,7 +373,7 @@ For Tusimple, the evaluation is formulated as follows:
where $C_{clip}$ and $S_{clip}$ represent the number of correct points (predicted points within 20 pixels of the ground truth) and the number of ground truth points, respectively. If the accuracy exceeds 85\%, the prediction is considered correct. TuSimple also reports the \textit{False Positive Rate} ($\mathrm{FPR}=1-\mathrm{Precision}$) and \textit{False Negative Rate} ($\mathrm{FNR}=1-\mathrm{Recall}$) metrics.
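Read as code, the TuSimple accuracy rule quoted above amounts to the small sketch below; the per-row pairing of predicted and ground-truth x-coordinates is an assumed data layout.

```python
import numpy as np

def tusimple_lane_accuracy(pred_x, gt_x, pixel_thresh=20.0):
    """Fraction of ground-truth points matched within `pixel_thresh` pixels.

    pred_x, gt_x: x-coordinates sampled at the same row positions (assumed layout).
    """
    correct = np.abs(np.asarray(pred_x) - np.asarray(gt_x)) < pixel_thresh
    return correct.mean()

def is_correct_prediction(pred_x, gt_x, acc_thresh=0.85):
    # A lane prediction counts as correct if its point accuracy exceeds 85%.
    return tusimple_lane_accuracy(pred_x, gt_x) > acc_thresh

# FPR = 1 - Precision and FNR = 1 - Recall are then computed over all lanes.
```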
\subsection{Implementation Details}
All input images are cropped and resized to $800\times320$. Similar to \cite{clrnet}, we apply random affine transformations and random horizontal flips. For the optimization process, we use the AdamW \cite{adam} optimizer with a learning rate warm-up and a cosine decay strategy. The initial learning rate is set to 0.006. The numbers of sampled points and regression points for each lane anchor are set to 36 and 72, respectively. The power coefficient of the cost function, $\beta$, is set to 6. The whole model (including LPM and GPM) is trained end-to-end, as in \cite{adnet}\cite{srlane}. All experiments are conducted on a single NVIDIA A100-40G GPU. To keep our model simple, we only use CNN-based backbones, namely ResNet \cite{resnet} and DLA34 \cite{dla}. Other details can be found in Appendix \textcolor{red}{E}.
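The optimizer recipe stated here (AdamW, warm-up, cosine decay, initial learning rate 0.006) can be approximated as follows; the warm-up length, weight decay, and step counts are placeholders, since the paragraph does not specify them.

```python
import math
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

def build_optimizer(model, total_steps, warmup_steps=800, base_lr=0.006):
    # AdamW with linear warm-up followed by cosine decay, as stated in the text;
    # warmup_steps and weight_decay are illustrative, not the authors' values.
    opt = AdamW(model.parameters(), lr=base_lr, weight_decay=0.01)

    def schedule(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)
        t = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return 0.5 * (1.0 + math.cos(math.pi * t))

    return opt, LambdaLR(opt, lr_lambda=schedule)
```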
\begin{table*}[htbp]
@@ -739,37 +739,37 @@ We also explore the stop-gradient strategy for the O2O classification subhead. A
\textbf{Ablation study on NMS-free block in dense scenarios.} Despite demonstrating the feasibility of replacing NMS with the O2O classification subhead in sparse scenarios, the shortcomings of NMS in dense scenarios remain. To investigate the performance of the NMS-free block in dense scenarios, we conduct experiments on the CurveLanes dataset, as detailed in Table \ref{aba_NMS_dense}.
In the traditional NMS post-processing \cite{clrernet}, the default IoU threshold is set to 50 pixels. However, this default setting may not always be optimal, especially in dense scenarios where some lane predictions might be erroneously eliminated. Lowering the IoU threshold increases recall but decreases precision. To find the most effective IoU threshold, we experimented with various values and found that a threshold of 15 pixels achieves the best trade-off, resulting in an F1-score of 86.81\%. In contrast, the NMS-free paradigm with the O2O classification subhead achieves an overall F1-score of 87.29\%, which is 0.48\% higher than the optimal threshold setting in the NMS paradigm. Additionally, both precision and recall are improved under the NMS-free approach. This indicates that the O2O classification subhead with the proposed GNN block is capable of learning both explicit geometric distances and implicit semantic distances between anchors, thus providing a more effective solution for dense scenarios than traditional NMS post-processing. More visualization outcomes can be seen in Appendix \textcolor{red}{E}.
\section{Conclusion and Future Work}
In this paper, we propose Polar R-CNN to address two key issues in anchor-based lane detection methods. By incorporating a local and global polar coordinate system, our Polar R-CNN achieves improved performance with fewer anchors. Additionally, the introduction of the O2O classification subhead with a GNN block allows us to replace the traditional NMS post-processing, and the NMS-free paradigm demonstrates superior performance in dense scenarios. Our model is highly flexible, and the number of anchors can be adjusted based on the specific scenario. Polar R-CNN is also deployment-friendly due to its simple structure, making it a potential new baseline for lane detection. Future work could explore new label assignment and anchor sampling strategies, as well as more sophisticated model structures such as large kernels and attention mechanisms. We also plan to extend Polar R-CNN to video instance and 3D lane detection tasks, utilizing advanced geometric modeling techniques.
%
\bibliographystyle{IEEEtran}
\bibliography{reference}
% \newpage
% \begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/wsq.jpg}}]{Shengqi Wang}
% received the Master degree from Xi'an Jiaotong University, Xi'an, China, in 2022. He is now pursuing the Ph.D. degree in statistics at Xi'an Jiaotong University. His research interests include low-level computer vision, deep learning, and so on.
% \end{IEEEbiography}
% \begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/ljm.pdf}}]{Junmin Liu}
% was born in 1982. He received the Ph.D. degree in Mathematics from Xi'an Jiaotong University, Xi'an, China, in 2013. From 2011 to 2012, he served as a Research Assistant with the Department of Geography and Resource Management at the Chinese University of Hong Kong, Hong Kong, China. From 2014 to 2017, he worked as a Visiting Scholar at the University of Maryland, College Park, USA. He is currently a full Professor at the School of Mathematics and Statistics, Xi'an Jiaotong University, Xi'an, China. His research interests are mainly focused on the theory and application of machine learning and image processing. He has published over 60 research papers in international conferences and journals.
% \end{IEEEbiography}
% \begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/xiangyongcao.jpg}}]{Xiangyong Cao (Member, IEEE)}
% received the B.Sc. and Ph.D. degrees from Xi'an Jiaotong University, Xi'an, China, in 2012 and 2018, respectively. From 2016 to 2017, he was a Visiting Scholar with Columbia University, New York, NY, USA. He is an Associate Professor with the School of Computer Science and Technology, Xi'an Jiaotong University. His research interests include statistical modeling
% and image processing.
% \end{IEEEbiography}
% \begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/photo_ZengjieSong.jpg}}]{Zengjie Song}
% received the B.S. degree in applied mathematics and the Ph.D. degree in statistics from Xi'an Jiaotong University (XJTU), Xi'an, China, in 2013 and 2020, respectively. From 2017 to 2018, he was a visiting Ph.D. student with the Department of Computer Science, University of Illinois at Urbana-Champaign, Champaign, IL, USA. From 2020 to 2023, he was a Postdoctoral Researcher with the Institute of Automation, Chinese Academy of Sciences, Beijing, China. In May 2023, he returned to XJTU, where he is currently an Assistant Professor with the Department of Statistics. His research interests include predictive coding, multimodal learning, generative models, and computer vision, with an emphasis on the intersection of machine learning and computational neuroscience.
% \end{IEEEbiography}
% \begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/sunkai.jpg}}]{Kai Sun}
% received his Ph.D. degree in statistics from Xi'an Jiaotong University, Xi'an, China, in 2020. He joined Xi'an Jiaotong University, China, in 2020, where he is currently an associate professor in the School of Mathematics and Statistics. His research interests include deep learning and image processing. Up to now, he has authored and coauthored one monograph and 20+ academic papers, primarily in journals such as IEEE TIP, IEEE TNNLS and others. Additionally, he has published one ESI highly cited paper and ESI hot paper as the first author.
% \end{IEEEbiography}
% \vfill
% \newpage
\clearpage
% When the appendix has multiple sections
\enablecitations
@@ -850,7 +850,7 @@ We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classi
The positive corresponding anchors, $\left\{ \theta _i,r_{i}^{g} \right\} |_{i=1}^{K}$;\\
The confidence emanating from the O2M classification subhead, $s_i^g$;\\
The regressions emanating from the O2M regression subhead, denoted as $\left\{ Lane_i \right\} |_{i=1}^{K}$\\
The predetermined thresholds $\tau^\theta$, $\lambda^g$, $\tau_d$ and $\tau_{o2m}$.
\ENSURE ~~\\ % Output of the algorithm
\STATE Calculate the confidence-prior adjacency matrix $\boldsymbol{A}^{C}\in\mathbb{R}^{K\times K}$, defined as follows:
\begin{align}
@@ -863,7 +863,7 @@ We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classi
\STATE Calculate the geometric-prior adjacency matrix $\boldsymbol{A}^{G}\in\mathbb{R}^{K\times K}$, which is defined as follows:
\begin{align}
A_{ij}^{G}=\begin{cases}
1,\, \mathrm{if}\,\, \left| \theta _i-\theta _j \right|<\tau^{\theta}\,\,and\,\,\left| r_{i}^{g}-r_{j}^{g} \right|<\lambda^g;\\
0,\, \mathrm{others}.\\
\end{cases}
\label{geometric prior matrix}
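A direct vectorized reading of Eq. (\ref{geometric prior matrix}) is sketched below; the threshold values passed in are placeholders, and the NumPy formulation is an assumption about one possible implementation.

```python
import numpy as np

def geometric_prior_adjacency(theta, r_g, tau_theta=0.1, lambda_g=50.0):
    """A^G[i, j] = 1 if anchors i and j are close in both angle and radius.

    theta, r_g: (K,) polar parameters of the K anchors.
    tau_theta, lambda_g: illustrative thresholds, not the paper's values.
    """
    theta, r_g = np.asarray(theta), np.asarray(r_g)
    d_theta = np.abs(theta[:, None] - theta[None, :])
    d_r = np.abs(r_g[:, None] - r_g[None, :])
    return ((d_theta < tau_theta) & (d_r < lambda_g)).astype(np.float32)
```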
@@ -896,12 +896,12 @@ We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classi
The new algorithm possesses a distinctly different form from its predecessor \cite{yolact}. We introduce a geometric-prior adjacency matrix $\boldsymbol{A}^G$, alleviating the suppression relationship between disparate anchors. It is straightforward to show that, when all elements of $\boldsymbol{A}^{G}$ are set to $1$ (\textit{i.e.}, disregarding geometric priors), Algorithm \ref{Graph Fast NMS} is equivalent to Fast NMS. Building upon our newly proposed sort-free Fast NMS with geometric prior, we design the structure of the one-to-one classification head.
The principal limitations of NMS lie in two steps, namely the purely geometric definition of distance (\textit{i.e.}, Eq. (\ref{al_1-3})) and the threshold employed to eliminate redundant predictions (\textit{i.e.}, Eq. (\ref{al_1-4})). For instance, in scenarios involving double lines, despite the minimal geometric distance between the two lane instances, their semantic divergence is remarkably pronounced. Consequently, we replace the aforementioned two steps with trainable neural networks, allowing them to alleviate the limitations of Fast NMS in a data-driven fashion. The neural network blocks that replace Eq. (\ref{al_1-3}) are Eqs. (\ref{edge_layer_1})-(\ref{edge_layer_3}) in the main text.
In Eq. (\ref{edge_layer_3}), the inverse distance $\boldsymbol{D}_{ij}^{edge}\in\mathbb{R}^{d_n}$ transcends its scalar form, encapsulating the semantic distance between predictions.
We use element-wise max pooling over this tensor as the replacement of the max operation applied to scalars, as delineated in Eq. (\ref{maxpooling}) in the main text. Furthermore, the predetermined $\left( \tau ^d \right) ^{-1}$ is no longer utilized as the distance threshold. We define a neural network as the implicit decision plane that formulates the final score $\tilde{s}_{i}^{g}$, as defined in Eq. (\ref{node_layer}), serving as the replacement of Eq. (\ref{al_1-4}).
The score $\tilde{s}_{i}^{g}$ output by the neural network transitions from a binary score to a continuous soft score ranging from 0 to 1. We introduce a new threshold $\tau_{o2o}$ within the updated criteria of Eq. (\ref{al_1-5}):
\begin{align}
\varOmega_{nms-free}^{pos}=\left\{i|\tilde{s}_{i}^{g}>\tau_{o2o} \right\} \cap \left\{ i|s_{i}^{g}>\tau_{o2m} \right\}.
\end{align}
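In code, this NMS-free positive set reduces to two element-wise comparisons; the snippet below is a minimal sketch of that selection with arbitrary illustrative thresholds.

```python
import numpy as np

def nms_free_positive_set(s_o2o, s_o2m, tau_o2o=0.5, tau_o2m=0.4):
    """Indices kept by the NMS-free criterion: both confidences must pass."""
    s_o2o, s_o2m = np.asarray(s_o2o), np.asarray(s_o2m)
    return np.nonzero((s_o2o > tau_o2o) & (s_o2m > tau_o2m))[0]

# Example with five anchors
keep = nms_free_positive_set([0.9, 0.2, 0.7, 0.1, 0.6], [0.8, 0.9, 0.3, 0.2, 0.7])
print(keep)  # -> [0 4]
```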
@@ -970,7 +970,7 @@ To ensure the IoU between lane instances aligns with the conventions of general
where $w^{b}$ is the base semi-width parameter and $w_{i,p}$ is the actual semi-width of the $p$-th lane instance. The sets $\left\{ b_{i,p}^{l} \right\} _{i=1}^{N}$ and $\left\{ b_{i,p}^{r} \right\} _{i=1}^{N}$ signify the left and right boundaries of the $p$-th lane instance. Subsequently, we define the intersection and union between lane instances:
\begin{align}
d_{i,pq}^{\mathcal{O}}&=\max \left( \min \left( b_{i,p}^{r}, b_{i,q}^{r} \right) -\max \left( b_{i,p}^{l}, b_{i,q}^{l} \right) , 0 \right),\\
d_{i,pq}^{\xi}&=\max \left( \max \left( b_{i,p}^{l}, b_{i,q}^{l} \right) -\min \left( b_{i,p}^{r}, b_{i,q}^{r} \right) , 0 \right),\\
d_{i,pq}^{\mathcal{U}}&=\max \left( b_{i,p}^{r}, b_{i,q}^{r} \right) -\min \left( b_{i,p}^{l}, b_{i,q}^{l} \right).
\end{align}
Here, $\left\{d_{i,pq}^{\mathcal{O}}\right\}_{i=1}^{N}$, $\left\{d_{i,pq}^{\xi}\right\}_{i=1}^{N}$ and $\left\{d_{i,pq}^{\mathcal{U}}\right\}_{i=1}^{N}$ denote the overlap distance, gap distance, and union distance, respectively. These definitions closely resemble, but slightly differ from, those in \cite{clrnet} and \cite{adnet}, with modifications to ensure non-negative values. This formulation aims to maintain consistency with the IoU definitions used for bounding boxes. Thus, the overall GLaneIoU between the $p$-th and $q$-th lane instances is expressed as:
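The closed-form GLaneIoU expression falls outside this diff hunk, so the sketch below stops at the three per-point distances defined above; the array names and the assumption that both lanes are sampled at the same $N$ rows are illustrative.

```python
import numpy as np

def lane_pair_distances(bl_p, br_p, bl_q, br_q):
    """Per-point overlap, gap, and union distances between lanes p and q.

    bl_*, br_*: (N,) left and right boundaries sampled at the same N rows.
    """
    d_overlap = np.maximum(np.minimum(br_p, br_q) - np.maximum(bl_p, bl_q), 0.0)
    d_gap = np.maximum(np.maximum(bl_p, bl_q) - np.minimum(br_p, br_q), 0.0)
    d_union = np.maximum(br_p, br_q) - np.minimum(bl_p, bl_q)
    return d_overlap, d_gap, d_union
```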
@@ -1242,12 +1242,6 @@ In the one-to-many label assignment, we simply use SimOTA \cite{yolox}, which al
Given the ground truth label generated by the label assignment strategy for each prediction, we can compute the loss functions during the training phase. As illustrated in Fig. \ref{head_assign}, $\mathcal{L}_{cls}^{o2o}$ and $\mathcal{L}_{rank}$ are for the O2O classification subhead, $\mathcal{L}_{cls}^{o2m}$ is for the O2M classification subhead, whereas $\mathcal{L}_{GIOU}$ (with $g=1$), $\mathcal{L}_{end}$ and $\mathcal{L}_{aux}$ are for the O2M regression subhead.
\label{assign_appendix}
\section{Supplementary Implementation Details and Visualization Results}
Some important implementation details for each dataset are shown in Table \ref{dataset_info}. This includes the dataset information we employed to conduct experiments and visualizations, the parameters for data processing, and the hyperparameters of Polar R-CNN.

main2.bbl (Normal file, 99 lines added)

@@ -0,0 +1,99 @@
% Generated by IEEEtran.bst, version: 1.14 (2015/08/26)
\begin{thebibliography}{10}
\providecommand{\url}[1]{#1}
\csname url@samestyle\endcsname
\providecommand{\newblock}{\relax}
\providecommand{\bibinfo}[2]{#2}
\providecommand{\BIBentrySTDinterwordspacing}{\spaceskip=0pt\relax}
\providecommand{\BIBentryALTinterwordstretchfactor}{4}
\providecommand{\BIBentryALTinterwordspacing}{\spaceskip=\fontdimen2\font plus
\BIBentryALTinterwordstretchfactor\fontdimen3\font minus
\fontdimen4\font\relax}
\providecommand{\BIBforeignlanguage}[2]{{%
\expandafter\ifx\csname l@#1\endcsname\relax
\typeout{** WARNING: IEEEtran.bst: No hyphenation pattern has been}%
\typeout{** loaded for the language `#1'. Using the pattern for}%
\typeout{** the default language instead.}%
\else
\language=\csname l@#1\endcsname
\fi
#2}}
\providecommand{\BIBdecl}{\relax}
\BIBdecl
\bibitem{detr}
N.~Carion, F.~Massa, G.~Synnaeve, N.~Usunier, A.~Kirillov, and S.~Zagoruyko,
``End-to-end object detection with transformers,'' in \emph{European
conference on computer vision}.\hskip 1em plus 0.5em minus 0.4em\relax
Springer, 2020, pp. 213--229.
\bibitem{learnNMS}
J.~Hosang, R.~Benenson, and B.~Schiele, ``Learning non-maximum suppression,''
in \emph{Proceedings of the IEEE conference on computer vision and pattern
recognition}, 2017, pp. 4507--4515.
\bibitem{yolov10}
A.~Wang, H.~Chen, L.~Liu, K.~Chen, Z.~Lin, J.~Han, and G.~Ding, ``Yolov10:
Real-time end-to-end object detection,'' \emph{arXiv preprint
arXiv:2405.14458}, 2024.
\bibitem{o2o}
P.~Sun, Y.~Jiang, E.~Xie, W.~Shao, Z.~Yuan, C.~Wang, and P.~Luo, ``What makes
for end-to-end object detection?'' in \emph{International Conference on
Machine Learning}.\hskip 1em plus 0.5em minus 0.4em\relax PMLR, 2021, pp.
9934--9944.
\bibitem{o3d}
J.~Wang, L.~Song, Z.~Li, H.~Sun, J.~Sun, and N.~Zheng, ``End-to-end object
detection with fully convolutional network,'' in \emph{Proceedings of the
IEEE/CVF conference on computer vision and pattern recognition}, 2021, pp.
15\,849--15\,858.
\bibitem{relationnet}
H.~Hu, J.~Gu, Z.~Zhang, J.~Dai, and Y.~Wei, ``Relation networks for object
detection,'' in \emph{Proceedings of the IEEE conference on computer vision
and pattern recognition}, 2018, pp. 3588--3597.
\bibitem{yolact}
D.~Bolya, C.~Zhou, F.~Xiao, and Y.~J. Lee, ``Yolact: Real-time instance
segmentation,'' in \emph{Proceedings of the IEEE/CVF international conference
on computer vision}, 2019, pp. 9157--9166.
\bibitem{iouloss}
J.~Yu, Y.~Jiang, Z.~Wang, Z.~Cao, and T.~Huang, ``Unitbox: An advanced object
detection network,'' in \emph{Proceedings of the 24th ACM international
conference on Multimedia}, 2016, pp. 516--520.
\bibitem{giouloss}
H.~Rezatofighi, N.~Tsoi, J.~Gwak, A.~Sadeghian, I.~Reid, and S.~Savarese,
``Generalized intersection over union: A metric and a loss for bounding box
regression,'' in \emph{Proceedings of the IEEE/CVF conference on computer
vision and pattern recognition}, 2019, pp. 658--666.
\bibitem{clrnet}
T.~Zheng, Y.~Huang, Y.~Liu, W.~Tang, Z.~Yang, D.~Cai, and X.~He, ``Clrnet:
Cross layer refinement network for lane detection,'' in \emph{Proceedings of
the IEEE/CVF conference on computer vision and pattern recognition}, 2022,
pp. 898--907.
\bibitem{adnet}
L.~Xiao, X.~Li, S.~Yang, and W.~Yang, ``Adnet: Lane shape prediction via anchor
decomposition,'' in \emph{Proceedings of the IEEE/CVF International
Conference on Computer Vision}, 2023, pp. 6404--6413.
\bibitem{date}
Y.~Chen, Q.~Chen, Q.~Hu, and J.~Cheng, ``Date: Dual assignment for end-to-end
fully convolutional object detection,'' \emph{arXiv preprint
arXiv:2211.13859}, 2022.
\bibitem{clrernet}
H.~Honda and Y.~Uchida, ``Clrernet: Improving confidence of lane detection with
laneiou,'' in \emph{Proceedings of the IEEE/CVF Winter Conference on
Applications of Computer Vision}, 2024, pp. 1176--1185.
\bibitem{yolox}
Z.~Ge, S.~Liu, F.~Wang, Z.~Li, J.~Sun \emph{et~al.}, ``Yolox:
Exceeding yolo series in 2021,'' \emph{arXiv preprint arXiv:2107.08430},
2021.
\end{thebibliography}

539
main2.tex Normal file

@ -0,0 +1,539 @@
\documentclass[lettersize,journal]{IEEEtran}
\usepackage{amsmath,amsfonts}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{array}
% \usepackage[caption=false,font=normalsize,labelfont=sf,textfont=sf]{subfig}
\usepackage{textcomp}
\usepackage{stfloats}
\usepackage{url}
\usepackage{verbatim}
\usepackage{graphicx}
\usepackage{cite}
\usepackage{subcaption}
\usepackage{multirow}
\usepackage[T1]{fontenc}
\usepackage{adjustbox}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{tikz}
\usepackage{tabularx}
\usepackage{mathrsfs}
\usepackage{etoolbox}
% Define a command to disable citation commands
\newcommand{\disablecitations}{%
\renewcommand{\cite}[1]{}%
}
% Define a command to restore citation commands
\newcommand{\enablecitations}{%
\let\cite\oldcite%
}
% Save the original \cite command
\let\oldcite\cite
\usepackage[colorlinks,bookmarksopen,bookmarksnumbered, linkcolor=red]{hyperref}
\definecolor{darkgreen}{RGB}{17,159,27} %
\aboverulesep=0pt
\belowrulesep=0pt
\hyphenation{op-tical net-works semi-conduc-tor IEEE-Xplore}
% updated with editorial comments 8/9/2021
% \renewcommand{\includegraphics}[2][]{} % Redefine \includegraphics as a no-op
\begin{document}
\disablecitations
\enablecitations
\title{Appendix and Supplementary Materials}
\markboth{Appendix and Supplementary Materials}%
{Appendix and Supplementary Materials}
\maketitle
\begin{appendices}
\setcounter{table}{0} % reset the counter so that tables are numbered from A1
\setcounter{figure}{0}
\setcounter{section}{0}
\setcounter{equation}{0}
\renewcommand{\thetable}{A\arabic{table}}
\renewcommand{\thefigure}{A\arabic{figure}}
\renewcommand{\thesection}{A\arabic{section}}
\renewcommand{\theequation}{A\arabic{equation}}
\addcontentsline{toc}{section}{Appendix} % add the appendix heading to the table of contents if needed
\section{Details about the Coordinate Systems}
In this section, we introduce the details about the coordinate systems employed in our model and coordinate transformations between them.
For convenience, we adopt a Cartesian coordinate system instead of the image coordinate system, wherein the y-axis is oriented from bottom to top and the x-axis from left to right. The coordinates of the local poles $\left\{\boldsymbol{c}^l_i\right\}$, the global pole $\boldsymbol{c}^g$, and the sampled points $\{(x_{1,j}^s,y_{1,j}^s),(x_{2,j}^s,y_{2,j}^s),\cdots,(x_{N,j}^s,y_{N,j}^s)\}_{j=1}^{K}$ of anchors all lie in this coordinate system by default.
We now derive the transformations between the different coordinate systems, with the crucial symbols illustrated in Fig. \ref{elu_proof}. These geometric transformations follow from elementary analytic geometry in Euclidean space. The derivation of the local-to-global polar transformation is as follows:
\begin{align}
r_{j}^{g}&=\left\| \overrightarrow{c^gh_{j}^{g}} \right\| =\left\| \overrightarrow{h_{j}^{a}h_{j}^{l}} \right\| \notag\\
&=\left\| \overrightarrow{c_{j}^{l}h_{j}^{l}}-\overrightarrow{c_{j}^{l}h_{j}^{a}} \right\| =\left\| \overrightarrow{c_{j}^{l}h_{j}^{l}} \right\| -\left\| \overrightarrow{c_{j}^{l}h_{j}^{a}} \right\| \notag\\
&=\left\| \overrightarrow{c_{j}^{l}h_{j}^{l}} \right\| - \frac{\overrightarrow{c_{j}^{l}h_{j}^{a}}}{\left\| \overrightarrow{c_{j}^{l}h_{j}^{a}} \right\|}\cdot \overrightarrow{c_{j}^{l}h_{j}^{a}} =\left\| \overrightarrow{c_{j}^{l}h_{j}^{l}} \right\| +\frac{\overrightarrow{c_{j}^{l}h_{j}^{a}}}{\left\| \overrightarrow{c_{j}^{l}h_{j}^{a}} \right\|}\cdot \overrightarrow{c^gc_{j}^{l}} \notag\\
&=r_{j}^{l}+\left[ \cos \theta _j;\sin \theta _j \right] ^T\left( \boldsymbol{c}_{j}^{l}-\boldsymbol{c}^g \right),
\label{proof_l2g}
\end{align}
where $h_j^l$, $h_j^g$ and $h_j^a$ denote the feet of the respective perpendiculars in Fig. \ref{elu_proof}.
Analogously, the derivation of sampling points along a lane anchor is provided as follows:
\begin{align}
&\overrightarrow{c^gp_{i,j}^{s}}\cdot \overrightarrow{c^gh_{j}^{g}}=\overrightarrow{c^gh_{j}^{g}}\cdot \overrightarrow{c^gh_{j}^{g}} \notag\\
\Rightarrow &\overrightarrow{c^gp_{i,j}^{s}}\cdot \overrightarrow{c^gh_{j}^{g}}=\left\| \overrightarrow{c^gh_{j}^{g}} \right\| \left\| \overrightarrow{c^gh_{j}^{g}} \right\| \notag\\
\Rightarrow &\frac{\overrightarrow{c^gh_{j}^{g}}}{\left\| \overrightarrow{c^gh_{j}^{g}} \right\|}\cdot \overrightarrow{c^gp_{i,j}^{s}}=\left\| \overrightarrow{c^gh_{j}^{g}} \right\| \notag\\
\Rightarrow &\left[ \cos \theta _j;\sin \theta _j \right] ^T\left( \boldsymbol{p}_{i,j}^{s}-\boldsymbol{c}^g \right) =r_{j}^{g}\notag\\
\Rightarrow &x_{i,j}^{s}\cos \theta _j+y_{i,j}^{s}\sin \theta _j=r_{j}^{g}+\left[ \cos \theta _j;\sin \theta _j \right] ^T\boldsymbol{c}^g \notag\\
\Rightarrow &x_{i,j}^{s}=-y_{i,j}^{s}\tan \theta _j+\frac{r_{j}^{g}+\left[ \cos \theta _j;\sin \theta _j \right] ^T\boldsymbol{c}^g}{\cos \theta _j},
\label{proof_sample}
\end{align}
where $p_{i,j}^{s}$ represents the $i$-th sampled point of the $j$-th lane anchor, whose coordinate is $\boldsymbol{p}_{i,j}^{s}\equiv(x_{i,j}^s, y_{i,j}^s)$.
\label{appendix_coord}
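For concreteness, a minimal NumPy sketch of the two transformations above is provided below; the function names and the toy values are purely illustrative and are not part of the released implementation.
\begin{verbatim}
import numpy as np

def local_to_global_radius(r_local, theta, c_local, c_global):
    # local-to-global relation:
    # r_g = r_l + [cos(theta); sin(theta)]^T (c_l - c_g)
    n = np.array([np.cos(theta), np.sin(theta)])
    return r_local + n @ (np.asarray(c_local) - np.asarray(c_global))

def sample_x(y_samples, r_global, theta, c_global):
    # sampling relation:
    # x = -y tan(theta) + (r_g + [cos; sin]^T c_g) / cos(theta)
    n = np.array([np.cos(theta), np.sin(theta)])
    offset = (r_global + n @ np.asarray(c_global)) / np.cos(theta)
    return -np.asarray(y_samples, dtype=float) * np.tan(theta) + offset

# toy usage: one anchor described w.r.t. a local pole, re-expressed globally
r_g = local_to_global_radius(5.0, 0.3, c_local=(40.0, 20.0),
                             c_global=(0.0, 0.0))
xs = sample_x(np.linspace(0.0, 50.0, 5), r_g, 0.3, c_global=(0.0, 0.0))
\end{verbatim}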
\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{thesis_figure/elu_proof.png}
\caption{The symbols employed in the derivation of coordinate transformations across different coordinate systems.}
\label{elu_proof}
\end{figure}
\section{The Design Principles of the One-to-One Classification Head}
Two fundamental prerequisites of the NMS-free framework lie in the label assignment strategies and the head structures.
As for the label assignment strategy, previous works use one-to-many label assignments, which make the detection head produce redundant predictions for each ground truth and therefore require NMS post-processing. Thus, some works \cite{detr}\cite{learnNMS} proposed one-to-one label assignment, e.g., via the Hungarian algorithm, which forces the model to predict a single positive sample for each lane.
However, directly using one-to-one label assignment harms the learning of the model: ordinary structures such as MLPs and CNNs struggle to capture the ``one-to-one'' property, resulting in decreased performance compared to one-to-many label assignments with NMS post-processing \cite{yolov10}\cite{o2o}. Consider a trivial example. Let $\boldsymbol{F}^{roi}_{i}$ denote the RoI features extracted from the $i$-th anchor, and let the model be trained with one-to-one label assignment. Assume that the $i$-th and $j$-th anchors are both close to the ground truth and overlap with each other; their RoI features are then similar, which can be expressed as follows:
\begin{align}
\boldsymbol{F}_{i}^{roi}\approx \boldsymbol{F}_{j}^{roi}.
\end{align}
Suppose that $\boldsymbol{F}^{roi}_{i}$ is assigned as a positive sample while $\boldsymbol{F}^{roi}_{j}$ is assigned as a negative sample; the ideal outcome would then be:
\begin{align}
f_{cls}\left( \boldsymbol{F}_{i}^{roi} \right) &\rightarrow 1, \notag\\
f_{cls}\left( \boldsymbol{F}_{j}^{roi} \right) &\rightarrow 0,
\label{sharp fun}
\end{align}
where $f_{cls}$ represents a classification head with an ordinary structure such as an MLP or CNN. Eq. (\ref{sharp fun}) implies that $f_{cls}$ needs to be ``sharp'' enough to differentiate between two nearly identical features: if $\| \boldsymbol{F}_{i}^{roi}-\boldsymbol{F}_{j}^{roi} \|$ is small while the two outputs must differ by almost $1$, the required local Lipschitz constant becomes very large. In other words, the output of $f_{cls}$ must change rapidly over short distances in feature space, and such sharp behavior is hard for plain MLPs or CNNs to learn. Consequently, additional heuristic structures such as \cite{o3d}\cite{relationnet} need to be developed.
We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classification subhead. Fast NMS serves as an iteration-free post-processing algorithm based on traditional NMS. Furthermore, we have incorporated a sort-free strategy along with geometric priors into Fast NMS, with the specifics delineated in Algorithm \ref{Graph Fast NMS}.
\begin{algorithm}[t]
\caption{Fast NMS with Geometric Prior.}
\begin{algorithmic}[1] % the optional argument 1 numbers every line
\REQUIRE ~~\\ % Input parameters of the algorithm
The index of all anchors, $1, 2, \cdots, i, \cdots, K$;\\
The parameters of the corresponding anchors, $\left\{ \theta _i,r_{i}^{g} \right\} |_{i=1}^{K}$;\\
The confidence emanating from the O2M classification subhead, $s_i^g$;\\
The regressions emanating from the O2M regression subhead, denoted as $\left\{ Lane_i \right\} |_{i=1}^{K}$;\\
The predetermined thresholds $\tau^{\theta}$, $\lambda^g$, $\tau^d$ and $\tau_{o2m}$.
\ENSURE ~~\\ % Output of the algorithm
\STATE Calculate the confidence-prior adjacency matrix $\boldsymbol{A}^{C}\in\mathbb{R}^{K\times K}$, defined as follows:
\begin{align}
A_{ij}^{C}=\begin{cases}
1,\, \mathrm{if}\,\, s_i^g>s_j^g\,\,\mathrm{or}\,\,\left( s_i^g=s_j^g\,\,\mathrm{and}\,\,i>j \right);\\
0,\, \mathrm{otherwise}.\\
\end{cases}
\label{confidential matrix}
\end{align}
\STATE Calculate the geometric-prior adjacency matrix $\boldsymbol{A}^{G}\in\mathbb{R}^{K\times K}$, which is defined as follows:
\begin{align}
A_{ij}^{G}=\begin{cases}
1,\, \mathrm{if}\,\, \left| \theta _i-\theta _j \right|<\tau^{\theta}\,\,\mathrm{and}\,\,\left| r_{i}^{g}-r_{j}^{g} \right|<\lambda^g;\\
0,\, \mathrm{otherwise}.\\
\end{cases}
\label{geometric prior matrix}
\end{align}
\STATE Calculate the inverse distance matrix $\boldsymbol{D} \in \mathbb{R} ^{K \times K}$. The element $D_{ij}$ of $\boldsymbol{D}$ is defined as follows:
\begin{align}
D_{ij}=d^{-1}\left( Lane_i,Lane_j \right),
\label{al_1-3}
\end{align}
where $d\left(\cdot, \cdot \right)$ is a predefined function that quantifies the distance between two lane predictions, e.g., a distance induced by the lane IoU.
\STATE Define the adjacency matrix $\boldsymbol{A} = \boldsymbol{A}^{C} \odot \boldsymbol{A}^{G}$; the final confidence $\tilde{s}_j^g$ is calculated as follows:
\begin{align}
\tilde{s}_{j}^{g}=\begin{cases}
1,\, \mathrm{if}\,\, \mathrm{Max}\left(\boldsymbol{D}(:,j)|\boldsymbol{A}(:,j)=1\right)<\left( \tau ^d \right) ^{-1};\\
0,\, \mathrm{otherwise},\\
\end{cases}
\label{al_1-4}
\end{align}
where $j=1,2,\cdots,K$ and $\mathrm{Max}(\cdot|\boldsymbol{A}(:,j)=1)$ denotes the maximum over the $j$-th column of $\boldsymbol{D}$, restricted to the rows $i$ with $A_{ij}=1$.
\STATE Get the final selection set:
\begin{align}
\varOmega_{nms}^{pos}=\left\{ i|\tilde{s}_{i}^{g}=1 \right\} \cap \left\{i|s_{i}^{g}>\tau_{o2m} \right\}.
\label{al_1-5}
\end{align}
\RETURN The final selection result $\varOmega_{nms}^{pos}$.
\end{algorithmic}
\label{Graph Fast NMS}
\end{algorithm}
The new algorithm takes a distinctly different form from its predecessor \cite{yolact}. We introduce a geometric-prior adjacency matrix $\boldsymbol{A}^G$, which restricts suppression to anchors that are geometrically close. It is straightforward to show that, when all elements of $\boldsymbol{A}^{G}$ are set to $1$ (\textit{i.e.}, disregarding the geometric prior), Algorithm \ref{Graph Fast NMS} is equivalent to Fast NMS. Building upon this sort-free Fast NMS with geometric prior, we design the structure of the one-to-one classification head.
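For concreteness, a compact NumPy sketch of Algorithm \ref{Graph Fast NMS} is given below. It is a simplified reference only: the pairwise lane IoU is used in place of the inverse-distance matrix $\boldsymbol{D}$ (so the scalar threshold below plays the role of $\left( \tau ^d \right) ^{-1}$), and all threshold values are illustrative rather than tuned.
\begin{verbatim}
import numpy as np

def fast_nms_geometric(scores, thetas, radii, iou,
                       tau_theta=0.1, lam_g=50.0,
                       tau_d=0.5, tau_o2m=0.4):
    # scores: (K,) O2M confidences; thetas/radii: (K,) anchor parameters
    # iou: (K, K) pairwise lane IoU, used here as the inverse distance D
    K = len(scores)
    i, j = np.meshgrid(np.arange(K), np.arange(K), indexing="ij")
    # confidence prior A^C: row i may suppress column j only if it scores higher
    A_c = (scores[i] > scores[j]) | ((scores[i] == scores[j]) & (i > j))
    # geometric prior A^G: only geometrically close anchors interact
    A_g = (np.abs(thetas[i] - thetas[j]) < tau_theta) & \
          (np.abs(radii[i] - radii[j]) < lam_g)
    A = A_c & A_g
    # keep column j if every admissible suppressor is still far enough away
    masked = np.where(A, iou, -np.inf)
    keep = masked.max(axis=0) < tau_d
    return np.flatnonzero(keep & (scores > tau_o2m))

# toy usage with random inputs
rng = np.random.default_rng(0)
K = 8
iou = rng.random((K, K)); iou = (iou + iou.T) / 2; np.fill_diagonal(iou, 1.0)
kept = fast_nms_geometric(rng.random(K), rng.uniform(-1, 1, K),
                          rng.uniform(0, 200, K), iou)
\end{verbatim}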
The principal limitations of NMS lie in two steps: the purely geometric definition of distance (\textit{i.e.}, Eq. (\ref{al_1-3})) and the fixed threshold employed to eliminate redundant predictions (\textit{i.e.}, Eq. (\ref{al_1-4})). For instance, in scenarios involving double lines, the geometric distance between the two lane instances is minimal, yet their semantic divergence is pronounced. Consequently, we replace these two steps with trainable neural networks, allowing the limitation of Fast NMS to be alleviated in a data-driven fashion. The neural network blocks replacing Eq. (\ref{al_1-3}) are as follows:
\begin{align}
\widehat{\boldsymbol{F}}_{i}^{roi}&\gets \mathrm{ReLU}\left( \boldsymbol{W}_{roi}\boldsymbol{F}_{i}^{roi}+\boldsymbol{b}_{roi} \right), i=1,\cdots,K,\label{edge_layer_1}\\
\boldsymbol{F}_{ij}^{edge}&\gets \boldsymbol{W}_{in}\widehat{\boldsymbol{F}}_{j}^{roi}-\boldsymbol{W}_{out}\widehat{\boldsymbol{F}}_{i}^{roi},\label{edge_layer_2}\\
\boldsymbol{D}_{ij}^{edge}&\gets \mathrm{MLP}_{edge}\left(\boldsymbol{F}_{ij}^{edge}+\boldsymbol{W}_s\left( \boldsymbol{x}_{j}-\boldsymbol{x}_{i} \right) +\boldsymbol{b}_s \right).\label{edge_layer_3}
\end{align}
In Eq. (\ref{edge_layer_3}), the inverse distance $\boldsymbol{D}_{ij}^{edge}\in\mathbb{R}^{d_n}$ is no longer a scalar; it encodes the semantic distance between predictions. We use element-wise max pooling on this tensor as the replacement for the scalar max operation. Furthermore, the predetermined $\left( \tau ^d \right) ^{-1}$ is no longer used as the distance threshold. Instead, we define a neural network as an implicit decision plane to produce the final score $\tilde{s}_{j}^{g}$:
\begin{align}
\boldsymbol{D}_j^{roi}&\gets\mathrm{MPool}_{col}\left(\boldsymbol{D}^{edge}(:,j,:)|\boldsymbol{A}(:,j)=1\right), \label{maxpooling}\\
\tilde{s}_{j}^{g}&\gets \mathrm{MLP}_{roi}\left( \boldsymbol{D}_{j}^{roi} \right), j=1,\cdots,K, \label{node_layer}
\end{align}
which serves as the replacement of Eq. (\ref{al_1-4}).
The score $\tilde{s}_{i}^{g}$ output by the neural network is no longer binary but a continuous soft score ranging from $0$ to $1$. We therefore introduce a new threshold $\tau_{o2o}$ and update the criterion of Eq. (\ref{al_1-5}) accordingly:
\begin{align}
\varOmega_{nms-free}^{pos}=\left\{i|\tilde{s}_{i}^{g}>\tau_{o2o} \right\} \cap \left\{ i|s_{i}^{g}>\tau_{o2m} \right\}.
\end{align}
This criterion is also referred to as \textit{dual confidence selection} in the main text.
\label{NMS_appendix}
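To make the data flow of Eqs. (\ref{edge_layer_1})--(\ref{node_layer}) and the dual confidence selection concrete, the following NumPy sketch traces the O2O classification subhead with randomly initialized weights; the layer sizes, the output sigmoid, the anchor descriptors, and the adjacency matrix are illustrative assumptions rather than the released configuration.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
K, d_roi, d_n = 6, 64, 5               # illustrative sizes
relu = lambda x: np.maximum(x, 0.0)

F_roi = rng.normal(size=(K, d_roi))    # RoI features of the K anchors
x_pos = rng.normal(size=(K, 2))        # anchor descriptors (e.g. theta, r)
A = rng.random((K, K)) > 0.5           # adjacency from the two priors

# pairwise semantic "inverse distance" (edge layers)
W_roi = rng.normal(size=(d_roi, d_roi)); b_roi = np.zeros(d_roi)
W_in  = rng.normal(size=(d_roi, d_n));   W_out = rng.normal(size=(d_roi, d_n))
W_s   = rng.normal(size=(2, d_n));       b_s   = np.zeros(d_n)
W_e1  = rng.normal(size=(d_n, d_n));     W_e2  = rng.normal(size=(d_n, d_n))

F_hat  = relu(F_roi @ W_roi + b_roi)
F_edge = (F_hat @ W_in)[None] - (F_hat @ W_out)[:, None]   # (K, K, d_n)
D_edge = relu((F_edge + (x_pos[None] - x_pos[:, None]) @ W_s + b_s)
              @ W_e1) @ W_e2

# masked column-wise max pooling, then the node-level MLP
masked = np.where(A[:, :, None], D_edge, -1e9)
D_roi  = masked.max(axis=0)                                 # (K, d_n)
W_r1 = rng.normal(size=(d_n, d_n)); W_r2 = rng.normal(size=(d_n, 1))
logits = relu(D_roi @ W_r1) @ W_r2
s_o2o  = (1.0 / (1.0 + np.exp(-logits))).ravel()            # soft O2O scores

# dual confidence selection with illustrative thresholds
s_o2m = rng.random(K)
keep = np.flatnonzero((s_o2o > 0.46) & (s_o2m > 0.40))
\end{verbatim}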
\begin{table*}[t]
\centering
\caption{Dataset information and hyperparameters for the five datasets. For the CULane dataset, $*$ denotes the actual number of training samples used to train the model. Labels for some validation/test sets are unavailable, so different splits (\textit{i.e.}, validation or test set) are selected for different datasets.}
\begin{adjustbox}{width=\linewidth}
\begin{tabular}{l|l|ccccc}
\toprule
\multicolumn{2}{c|}{\textbf{Dataset}} & CULane & TUSimple & LLAMAS & DL-Rail & CurveLanes \\
\midrule
\multirow{7}*{Dataset Description}
& Train &88,880/$55,698^{*}$&3,268 &58,269&5,435&100,000\\
& Validation &9,675 &358 &20,844&- &20,000 \\
& Test &34,680&2,782 &20,929&1,569&- \\
& Resolution &$1640\times590$&$1280\times720$&$1276\times717$&$1920\times1080$&$2560\times1440$, etc\\
& Lane &$\leqslant4$&$\leqslant5$&$\leqslant4$&$=2$&$\leqslant10$\\
& Environment &urban and highway & highway&highway&railway&urban and highway\\
& Distribution &sparse&sparse&sparse&sparse&sparse and dense\\
\midrule
\multirow{2}*{Dataset Split}
& Evaluation &Test&Test&Test&Test&Val\\
& Visualization &Test&Test&Val&Test&Val\\
\midrule
\multirow{1}*{Data Preprocess}
& Crop Height &270&160&300&560&640, etc\\
\midrule
\multirow{6}*{Training Hyperparameter}
& Epoch Number &32&70&20&90&32\\
& Batch Size &40&24&32&40&40\\
& Warm up iterations &800&200&800&400&800\\
& $w_{aux}$ &0.2&0 &0.2&0.2&0.2\\
& $w_{rank}$ &0.7&0.7&0.1&0.7&0 \\
\midrule
\multirow{4}*{Evaluation Hyperparameter}
& $H^{l}\times W^{l}$ &$4\times10$&$4\times10$&$4\times10$&$4\times10$&$6\times13$\\
& $K$ &20&20&20&12&50\\
& $d_n$ &5&8&10&5&5\\
& $\tau_{o2m}$ &0.48&0.40&0.40&0.40&0.45\\
& $\tau_{o2o}$ &0.46&0.46&0.46&0.46&0.44\\
\bottomrule
\end{tabular}
\end{adjustbox}
\label{dataset_info}
\end{table*}
\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{thesis_figure/GLaneIoU.png} % replace with your image file name
\caption{Illustrations of GLaneIoU redefined in our work.}
\label{glaneiou}
\end{figure}
\section{Details of Intersection Over Union between Lane Instances}
To ensure the IoU between lane instances aligns with the conventions of general object detection methods \cite{iouloss}\cite{giouloss}, we redefine the IoU of lane pairs. As depicted in Fig. \ref{glaneiou}, the newly defined IoU for lane pairs, which we refer to as GLaneIoU, is formulated as follows:
\begin{align}
\Delta x_{i,p}^{d}&=x_{i+1,p}^{d}-x_{i-1,p}^{d},\,\, \Delta y_{i,p}^{d}=y_{i+1,p}^{d}-y_{i-1,p}^{d}, \\
w_{i,p}&=\frac{\sqrt{\left( \Delta x_{i,p}^{d} \right) ^2+\left( \Delta y_{i,p}^{d} \right) ^2}}{\Delta y_{i,p}^{d}}w^b,\\
b_{i,p}^{l}&=x_{i,p}^{d}-w_{i,p},\,\, b_{i,p}^{r}=x_{i,p}^{d}+w_{i,p},
\end{align}
where $w^{b}$ is the base semi-width parameter and $w_{i,p}$ is the actual semi-width of the $p$-th lane instance. The sets $\left\{ b_{i,p}^{l} \right\} _{i=1}^{N}$ and $\left\{ b_{i,p}^{r} \right\} _{i=1}^{N}$ denote the left and right boundaries of the $p$-th lane instance. Subsequently, we define the intersection and union between lane instances:
\begin{align}
d_{i,pq}^{\mathcal{O}}&=\max \left( \min \left( b_{i,p}^{r}, b_{i,q}^{r} \right) -\max \left( b_{i,p}^{l}, b_{i,q}^{l} \right) , 0 \right),\\
d_{i,pq}^{\xi}&=\max \left( \max \left( b_{i,p}^{l}, b_{i,q}^{l} \right) -\min \left( b_{i,p}^{r}, b_{i,q}^{r} \right) , 0 \right),\\
d_{i,pq}^{\mathcal{U}}&=\max \left( b_{i,p}^{r}, b_{i,q}^{r} \right) -\min \left( b_{i,p}^{l}, b_{i,q}^{l} \right).
\end{align}
The quantities $\left\{d_{i,pq}^{\mathcal{O}}\right\}_{i=1}^{N}$, $\left\{d_{i,pq}^{\xi}\right\}_{i=1}^{N}$ and $\left\{d_{i,pq}^{\mathcal{U}}\right\}_{i=1}^{N}$ denote the overlap distance, gap distance, and union distance, respectively. These definitions closely resemble, but slightly differ from, those in \cite{clrnet} and \cite{adnet}; the modifications ensure non-negative values and maintain consistency with the IoU definitions used for bounding boxes. Thus, the overall GLaneIoU between the $p$-th and $q$-th lane instances is expressed as:
\begin{align}
GIoU\left( p,q \right)=\frac{\sum\nolimits_{i=j}^k{d_{i,pq}^{\mathcal{O}}}}{\sum\nolimits_{i=j}^k{d_{i,pq}^{\mathcal{U}}}}-g\frac{\sum\nolimits_{i=j}^k{d_{i,pq}^{\xi}}}{\sum\nolimits_{i=j}^k{d_{i,pq}^{\mathcal{U}}}},
\end{align}
where $j$ and $k$ are the indices of the start point and the end point, respectively. It is evident that when $g=0$, the GLaneIoU for lane pairs is analogous to the IoU for bounding boxes, with a value range of $\left[0, 1 \right]$; when $g=1$, it is analogous to the GIoU for bounding boxes, with a value range of $\left(-1, 1 \right]$.
\label{giou_appendix}
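A short NumPy sketch of the GLaneIoU computation between two lanes sampled on the same rows is given below; the base semi-width, the value of $g$, and the handling of the start and end indices $j$ and $k$ (the sums here simply run over all rows) are illustrative assumptions.
\begin{verbatim}
import numpy as np

def glane_iou(x_p, x_q, dy, w_base=7.5, g=1.0):
    # x_p, x_q: (N,) lane x-coordinates sampled at N equally spaced rows
    # dy: vertical spacing between adjacent sampled rows (pixels)
    def bounds(x):
        dx = np.gradient(x) * 2.0          # ~ x_{i+1} - x_{i-1}
        w = np.sqrt(dx ** 2 + (2.0 * dy) ** 2) / (2.0 * dy) * w_base
        return x - w, x + w                # left / right lane boundaries
    lp, rp = bounds(np.asarray(x_p, dtype=float))
    lq, rq = bounds(np.asarray(x_q, dtype=float))
    overlap = np.maximum(np.minimum(rp, rq) - np.maximum(lp, lq), 0.0)
    gap     = np.maximum(np.maximum(lp, lq) - np.minimum(rp, rq), 0.0)
    union   = np.maximum(rp, rq) - np.minimum(lp, lq)
    return (overlap.sum() - g * gap.sum()) / union.sum()

# toy usage: two nearly parallel lanes 10 pixels apart
ys = np.arange(0.0, 72.0)
print(glane_iou(0.5 * ys + 100.0, 0.5 * ys + 110.0, dy=1.0))
\end{verbatim}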
\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{thesis_figure/detection_head_assign.png}
\caption{Label assignment and loss function for the triplet head.}
\label{head_assign}
\end{figure}
\begin{figure*}[t]
\centering
\def\pagewidth{0.49\textwidth}
\def\subwidth{0.47\linewidth}
\def\imgwidth{\linewidth}
\def\imgheight{0.5625\linewidth}
\def\dashheight{0.8\linewidth}
\begin{subfigure}{\pagewidth}
\rotatebox{90}{\small{GT}}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/culane/1_gt.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/culane/2_gt.jpg}
\end{minipage}
\end{subfigure}
\begin{subfigure}{\pagewidth}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/tusimple/1_gt.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/tusimple/2_gt.jpg}
\end{minipage}
\end{subfigure}
\vspace{0.5em}
\begin{subfigure}{\pagewidth}
\raisebox{-1.5em}{\rotatebox{90}{\small{Anchors}}}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/culane/1_anchor.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/culane/2_anchor.jpg}
\end{minipage}
\end{subfigure}
\begin{subfigure}{\pagewidth}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/tusimple/1_anchor.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/tusimple/2_anchor.jpg}
\end{minipage}
\end{subfigure}
\vspace{0.5em}
\begin{subfigure}{\pagewidth}
\raisebox{-2em}{\rotatebox{90}{\small{Predictions}}}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/culane/1_pred.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/culane/2_pred.jpg}
\end{minipage}
\caption{CULane}
\end{subfigure}
\begin{subfigure}{\pagewidth}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/tusimple/1_pred.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/tusimple/2_pred.jpg}
\end{minipage}
\caption{TuSimple}
\end{subfigure}
\vspace{0.5em}
% \begin{tikzpicture}
% \draw[dashed, pattern=on 8pt off 2pt, color=gray, line width=1pt] (-\textwidth/2,0) -- (\textwidth/2.,0);
% \end{tikzpicture}
% \vspace{0.05em}
\begin{subfigure}{\pagewidth}
\rotatebox{90}{\small{GT}}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/llamas/1_gt.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/llamas/2_gt.jpg}
\end{minipage}
\end{subfigure}
\begin{subfigure}{\pagewidth}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/dlrail/1_gt.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/dlrail/2_gt.jpg}
\end{minipage}
\end{subfigure}
\vspace{0.5em}
\begin{subfigure}{\pagewidth}
\raisebox{-1.5em}{\rotatebox{90}{\small{Anchors}}}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/llamas/1_anchor.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/llamas/2_anchor.jpg}
\end{minipage}
\end{subfigure}
\begin{subfigure}{\pagewidth}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/dlrail/1_anchor.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/dlrail/2_anchor.jpg}
\end{minipage}
\end{subfigure}
\vspace{0.5em}
\begin{subfigure}{\pagewidth}
\raisebox{-2em}{\rotatebox{90}{\small{Predictions}}}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/llamas/1_pred.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/llamas/2_pred.jpg}
\end{minipage}
\caption{LLAMAS}
\end{subfigure}
\begin{subfigure}{\pagewidth}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/dlrail/1_pred.jpg}
\end{minipage}
\begin{minipage}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_dataset/dlrail/2_pred.jpg}
\end{minipage}
\caption{DL-Rail}
\end{subfigure}
\vspace{0.5em}
\caption{Visualization of detection outcomes in sparse scenarios of four datasets.}
\label{vis_sparse}
\end{figure*}
\begin{figure*}[t]
\centering
\def\subwidth{0.24\textwidth}
\def\imgwidth{\linewidth}
\def\imgheight{0.5625\linewidth}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun_gt.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun_pred50.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun2_gt.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun2_pred50.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun2_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/redun2_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less_gt.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less_pred50.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less2_gt.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less2_pred50.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less2_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/less2_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all_gt.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all_pred50.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all2_gt.jpg}
\caption{GT}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all2_pred50.jpg}
\caption{NMS@50}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all2_pred15.jpg}
\caption{NMS@15}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/view_nms/all2_NMSfree.jpg}
\caption{NMSFree}
\end{subfigure}
\vspace{0.5em}
\caption{Visualization of the detection outcomes in sparse and dense scenarios on the CurveLanes dataset.}
\label{vis_dense}
\end{figure*}
\section{Details about the Label Assignment and Loss Function}
Details about the cost functions and label assignments for the triplet head are furnished here. A dual label assignment strategy \cite{date} is employed for the triplet head, as illustrated in Fig. \ref{head_assign}. Specifically, we apply one-to-many label assignments to both the O2M classification subhead and the O2M regression subhead; this part closely aligns with previous work \cite{clrernet}. To endow our model with the NMS-free paradigm, we additionally incorporate the O2O classification subhead and apply a one-to-one label assignment to it.
The cost metrics for both one-to-one and one-to-many label assignments are articulated as follows:
\begin{align}
\mathcal{C} _{p,q}^{o2o}=\tilde{s}_{p}^{g}\times \left( GIoU\left( p,q \right) \right) ^{\beta} \label{o2o_cost},\\
\mathcal{C} _{p,q}^{o2m}=s_{p}^{g}\times \left( GIoU\left( p,q \right) \right) ^{\beta}, \label{o2m_cost}
\end{align}
where $\mathcal{C} _{p,q}^{o2o}$ and $\mathcal{C} _{p,q}^{o2m}$ denote the cost metrics between the $p$-th prediction and the $q$-th ground truth, and $g$ in $GIoU$ is set to $0$ so that the metric remains non-negative. These metrics imply that both the confidence score and the geometric distance contribute to the assignment cost.
Suppose that there exist $K$ predictions and $G$ ground truths. Let $\pi$ denote a one-to-one label assignment strategy, where $\pi(q)$ indicates that the $\pi(q)$-th prediction is assigned to the $q$-th ground truth. Additionally, $\mathscr{S}_{K, G}$ denotes the set of all possible one-to-one assignment strategies for $K$ predictions and $G$ ground truths. It is straightforward to show that the total number of one-to-one assignment strategies $\left| \mathscr{S} _{K,G} \right|$ is $\frac{K!}{\left( K-G \right)!}$. The final optimal assignment $\hat{\pi}$ is determined as follows:
\begin{align}
\hat{\pi}=\underset{\pi \in \mathscr{S}_{K,G}}{\arg\max}\sum_{q=1}^G{\mathcal{C} _{\pi \left( q \right) ,q}^{o2o}}.
\end{align}
This assignment problem can be solved by the Hungarian algorithm \cite{detr}. Finally, $G$ predictions are assigned as positive samples and the remaining $K-G$ predictions as negative samples.
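A minimal sketch of this one-to-one assignment using SciPy's Hungarian solver is given below; the exponent $\beta$ and the array names are illustrative.
\begin{verbatim}
import numpy as np
from scipy.optimize import linear_sum_assignment

def o2o_assign(scores, giou, beta=2.0):
    # scores: (K,) O2O confidences; giou: (K, G) GLaneIoU with g = 0
    # returns pi(q) for q = 0..G-1, maximizing sum_q C^{o2o}_{pi(q), q}
    cost = scores[:, None] * np.clip(giou, 0.0, None) ** beta   # (K, G)
    pred_idx, gt_idx = linear_sum_assignment(-cost)             # maximize
    return pred_idx[np.argsort(gt_idx)]

# toy usage: 6 predictions, 2 ground truths
rng = np.random.default_rng(0)
print(o2o_assign(rng.random(6), rng.random((6, 2))))
\end{verbatim}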
In the one-to-many label assignment, we simply use SimOTA \cite{yolox}, in line with previous works \cite{clrernet}. Omitting the detailed procedure of SimOTA, we only introduce its inputs, namely the cost matrix $\boldsymbol{M}^C\in \mathbb{R}^{G\times K}$ and the IoU matrix $\boldsymbol{M}^{IoU}\in \mathbb{R}^{G\times K}$. The elements of the two matrices are defined as $M^C_{qp}=\mathcal{C} _{p,q}^{o2m}$ and $M^{IoU}_{qp}= GIoU\left( p,q \right)$ (with $g=0$), respectively. The number of predictions assigned to each ground truth is variable but does not exceed an upper bound $k_{dynamic}$, which is set to $4$ in our experiments. Finally, there are $K_{pos}$ positive samples and $K-K_{pos}$ negative samples, where $K_{pos}$ ranges from $0$ to $Gk_{dynamic}$.
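For intuition only, the following sketch is a much-simplified stand-in for this one-to-many step: each ground truth keeps at most $k_{dynamic}$ candidates with the largest cost metric. The full SimOTA procedure, including its dynamic-$k$ estimation, is more involved and is described in \cite{yolox}.
\begin{verbatim}
import numpy as np

def simple_o2m_assign(M_c, M_iou, k_dynamic=4, min_iou=0.0):
    # M_c, M_iou: (G, K) cost and IoU matrices as defined above
    # returns a (G, K) boolean mask of positive (ground truth, prediction) pairs
    G, K = M_c.shape
    pos = np.zeros((G, K), dtype=bool)
    for q in range(G):
        cand = np.flatnonzero(M_iou[q] > min_iou)
        top = cand[np.argsort(-M_c[q, cand])[:k_dynamic]]
        pos[q, top] = True
    return pos

# toy usage: 3 ground truths, 20 predictions
rng = np.random.default_rng(0)
mask = simple_o2m_assign(rng.random((3, 20)), rng.random((3, 20)))
\end{verbatim}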
Given the ground-truth label generated by the label assignment strategy for each prediction, we can construct the loss functions for the training phase. As illustrated in Fig. \ref{head_assign}, $\mathcal{L}_{cls}^{o2o}$ and $\mathcal{L}_{rank}$ are used for the O2O classification subhead, whereas $\mathcal{L}_{cls}^{o2m}$ is used for the O2M classification subhead. Meanwhile, $\mathcal{L}_{GIOU}$ (with $g=1$), $\mathcal{L}_{end}$ and $\mathcal{L}_{aux}$ are designated for the O2M regression subhead. The gradient from the O2O classification subhead to the RoI pooling layer is stopped to preserve the quality of feature learning. $\left( \hat{\theta}_{i,\cdot}^{seg},\hat{r}_{i,\cdot}^{seg} \right)$ is ignored during evaluation.
\label{assign_appendix}
\section{Supplementary Implementation Details and Visualization Results}
Some important implementation details for each dataset are shown in Table \ref{dataset_info}, including the dataset information used for our experiments and visualizations, the parameters for data processing, and the hyperparameters of Polar R-CNN.
Fig. \ref{vis_sparse} illustrates the visualization outcomes in sparse scenarios across four datasets. The top row depicts the ground truth, the middle row shows the proposed lane anchors, and the bottom row exhibits the predictions generated by Polar R-CNN with the NMS-free paradigm. In the top and bottom rows, different colors distinguish different lane instances and do not correspond across images. From the middle row, we can see that the LPH of Polar R-CNN effectively proposes anchors clustered around the ground truth, providing a robust prior for the GPH to produce the final lane predictions. Moreover, the number of anchors is significantly smaller than in previous works, making our method theoretically faster than other anchor-based methods.
Fig. \ref{vis_dense} shows the visualization outcomes in dense scenarios. The first column displays the ground truth, while the second and third columns show the detection results under the NMS paradigm with a large threshold (\textit{i.e.}, the default NMS@50, 50 pixels) and a small threshold (\textit{i.e.}, the optimal NMS@15, 15 pixels), respectively. The final column shows the detection results under the NMS-free paradigm. We observe that NMS@50 mistakenly removes some predictions, leading to false negatives, while NMS@15 fails to eliminate some redundant predictions, leading to false positives. This underscores the inherent trade-off between large and small NMS thresholds and demonstrates that geometric distance becomes less effective in dense scenarios. Only the proposed data-driven O2O classification subhead can address this issue by capturing semantic distance beyond geometric distance. As shown in the last column of Fig. \ref{vis_dense}, the O2O classification subhead successfully eliminates redundant predictions while preserving dense true predictions, despite their minimal geometric distances.
\label{vis_appendix}
\bibliographystyle{IEEEtran}
\bibliography{reference}
%\newpage
\end{appendices}
\end{document}


@ -1,2 +1,3 @@
latexmk -C
latexmk -pdf main.tex
# latexmk -pdf main2.tex

Binary file not shown.

Before: 86 KiB | After: 94 KiB