This commit is contained in:
ShqWW 2024-10-28 18:55:12 +08:00
parent 47da11307a
commit 60c8960e60
13 changed files with 177 additions and 228 deletions

main.tex

@@ -48,7 +48,7 @@
\begin{abstract}
Lane detection is a critical and challenging task in autonomous driving, particularly in real-world scenarios where traffic lanes can be slender, lengthy, and often obscured by other vehicles, complicating detection efforts. Existing anchor-based methods typically rely on prior lane anchors to extract features and subsequently refine the location and shape of lanes. While these methods achieve high performance, manually setting prior anchors is cumbersome, and ensuring sufficient coverage across diverse datasets often requires a large number of dense anchors. Furthermore,
the use of \textit{Non-Maximum Suppression} (NMS) to eliminate redundant predictions complicates real-world deployment and may underperform in complex scenarios. In this paper, we propose \textit{Polar R-CNN}, an NMS-free anchor-based method for lane detection. By incorporating both local and global polar coordinate systems, Polar R-CNN facilitates flexible anchor proposals and significantly reduces the number of anchors required without compromising performance. Additionally, we introduce a triplet head with a heuristic structure that supports an NMS-free paradigm, enhancing deployment efficiency and performance in scenarios with dense lanes. Our method achieves competitive results on five popular lane detection benchmarks—\textit{Tusimple}, \textit{CULane}, \textit{LLAMAS}, \textit{CurveLanes}, and \textit{DL-Rail}—while maintaining a lightweight design and straightforward structure. Our source code is available at \href{https://github.com/ShqWW/PolarRCNN}{\textit{https://github.com/ShqWW/PolarRCNN}}.
\end{abstract}
\begin{IEEEkeywords}
Lane Detection, NMS-Free, Graph Neural Network, Polar Coordinate System.
@@ -111,7 +111,7 @@ In recent years, advancements in deep learning and the availability of large dat
\caption{}
\end{subfigure}
\caption{Comparison of NMS thresholds in \textit{sparse} and \textit{dense} scenarios. (a) and (b) Ground truths in dense and sparse scenarios, respectively. (c) Predictions with a large NMS threshold in a dense scenario, resulting in a lane prediction being mistakenly suppressed. (d) Predictions with a small NMS threshold in a sparse scenario, where redundant predictions are not effectively removed.}
\label{NMS setting}
\end{figure}
%, where some lane instances are close with each others; , where the lane instance are far apart
@@ -136,7 +136,7 @@ To address the above two issues, we propose Polar R-CNN, a novel anchor-based me
\begin{figure*}[ht]
\centering
\includegraphics[width=0.99\linewidth]{thesis_figure/ovarall_architecture.png}
\caption{An illustration of the Polar R-CNN architecture. It follows a pipeline similar to Faster R-CNN for object detection and consists of a backbone, a \textit{Feature Pyramid Network} with three levels of feature maps, respectively denoted by $P_1, P_2, P_3$, followed by a \textit{Local Polar Module} and a \textit{Global Polar Module} for lane detection. Based on the designed lane representation and lane anchor representation in the polar coordinate system, the local polar module proposes sparse line anchors and the global polar module produces the final accurate lane predictions. The global polar module includes a triplet head, which comprises the \textit{one-to-one} (O2O) classification subhead, the \textit{one-to-many} (O2M) classification subhead, and the \textit{one-to-many} (O2M) regression subhead.}
\label{overall_architecture}
\end{figure*}
\section{Related Works}
@@ -200,7 +200,7 @@ The local polar system is designed to predict lane anchors adaptable to both spa
%This one-to-many approach is essential for ensuring comprehensive anchor proposals, especially since some local features around certain poles may be lost due to damage or occlusion of the lane curve.
\par
In the local polar coordinate system, the parameters of each lane anchor are determined based on the location of its corresponding local pole. In practice, however, once a lane anchor is generated, its position becomes fixed and independent of its original local pole. To simplify the representation of lane anchors in the second stage of Polar R-CNN, a global polar system is designed, featuring a single, unified pole that serves as a reference point for the entire image. The location of this global pole is set manually and, in our case, is positioned near the static \textit{vanishing point} observed across the entire lane image dataset\cite{Vpoint}. This design ensures a consistent and unified polar coordinate for expressing lane anchors within the global context of the image, facilitating accurate regression to the ground truth lane instances.
\begin{figure}[t]
\centering
@@ -220,159 +220,119 @@ The downsampled feature map $\boldsymbol{F}_d$ is then fed into two branches: a
\end{align}
The regression branch consists of a single $1\times1$ convolutional layer and aims to generate lane anchors by outputting their angles $\theta_j$ and radii $r^{l}_{j}$, \textit{i.e.}, $\boldsymbol{F}_{reg\,\,} \equiv \left\{\theta_{j}, r^{l}_{j}\right\}_{j=1}^{H^{l}\times W^{l}}$, in the local polar coordinate system defined previously. Similarly, the classification branch $\phi _{cls}^{l}\left(\cdot \right)$ consists of only two $1\times1$ convolutional layers for simplicity. This branch predicts the confidence heat map $\boldsymbol{F}_{cls\,\,}\equiv \left\{ s_j^l \right\} _{j=1}^{H^l\times W^l}$ for local poles, each associated with a feature point. By discarding local poles with lower confidence, the module increases the likelihood of selecting potential positive foreground lane anchors while effectively removing background lane anchors.
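For illustration, a minimal PyTorch-style sketch of such an LPM head is given below; the channel width, class name, and layer sizes are placeholders rather than the exact implementation.
\begin{verbatim}
import torch
import torch.nn as nn

class LocalPolarHead(nn.Module):
    """Sketch of the two LPM branches: a single 1x1-conv regression branch
    predicting (theta_j, r_j^l) per local pole, and a two-layer 1x1-conv
    classification branch predicting the confidence heat map s_j^l."""
    def __init__(self, in_channels=64):
        super().__init__()
        self.reg = nn.Conv2d(in_channels, 2, kernel_size=1)
        self.cls = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, 1, kernel_size=1),
        )

    def forward(self, feat_down):                    # (B, C, H^l, W^l)
        theta_r = self.reg(feat_down)                # (B, 2, H^l, W^l)
        scores = torch.sigmoid(self.cls(feat_down))  # (B, 1, H^l, W^l)
        return theta_r, scores
\end{verbatim}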
\par
\textbf{Loss Function for LPM.} To train the LPM, we define the ground truth labels for each local pole as follows: the ground truth radius, $\hat{r}^l_j$, is set to the minimum distance from the local pole to the corresponding lane curve, while the ground truth angle, $\hat{\theta}^l_j$, is set to the orientation of the vector extending from the local pole to the nearest point on the curve. Consequently, we have a label set of local poles $\hat{\boldsymbol{F}}_{cls}=\{\hat{s}_j^l\}_{j=1}^{H^l\times W^l}$, where $\hat{s}_j^l=1$ if the $j$-th local pole is positive and $\hat{s}_j^l=0$ if it is negative. Once the regression and classification labels are established, as shown in Fig. \ref{lpmlabel}, the LPM can be trained using the $Smooth_{L1}$ loss $S_{L1}\left(\cdot \right)$ for the regression branch and the \textit{Binary Cross-Entropy} loss $BCE\left( \cdot , \cdot \right)$ for the classification branch. The loss functions for the LPM are given as follows:
\begin{align}
\mathcal{L} ^{l}_{cls}&=BCE\left( \boldsymbol{F}_{cls},\hat{\boldsymbol{F}}_{cls} \right)\\
\mathcal{L} _{reg}^{l}&=\frac{1}{N_{pos}^{l}}\sum_{j\in \left\{ j|\hat{r}_{j}^{l}<\lambda^l \right\}}{\left( S_{L1}\left( \theta _{j}^{l}-\hat{\theta}_{j}^{l} \right) +S_{L1}\left( r_{j}^{l}-\hat{r}_{j}^{l} \right) \right)}
\label{loss_lph}
\end{align}
where $N^{l}_{pos}=\left|\{j|\hat{r}_j^l<\lambda^{l}\}\right|$ is the number of positive local poles in the LPM.
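The two losses can be sketched as follows, assuming the per-pole ground-truth angle and radius maps have already been computed; the tensor names are hypothetical.
\begin{verbatim}
import torch
import torch.nn.functional as F

def lpm_loss(theta, r, score, theta_gt, r_gt, lambda_l):
    """theta, r, score: flattened LPM predictions for all H^l*W^l poles
    (score holds sigmoid confidences); theta_gt, r_gt: angle/radius of the
    nearest lane point; lambda_l: radius threshold for positive poles."""
    pos = r_gt < lambda_l                       # positive-pole mask
    cls_target = pos.float()                    # labels \hat{s}_j^l
    loss_cls = F.binary_cross_entropy(score, cls_target)
    n_pos = pos.sum().clamp(min=1)
    loss_reg = (F.smooth_l1_loss(theta[pos], theta_gt[pos], reduction="sum")
                + F.smooth_l1_loss(r[pos], r_gt[pos], reduction="sum")) / n_pos
    return loss_cls, loss_reg
\end{verbatim}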
\par
\textbf{Top-$K$ Anchor Selection.} As discussed above, all $H^{l}\times W^{l}$ anchors, each associated with a local pole in the feature map, are considered as candidates during the training stage. However, some of these anchors serve as background anchors. We select the $K$ anchors with the highest confidence scores as the foreground candidates to feed into the second stage (\textit{i.e.}, the global polar module). During training, all anchors are chosen as candidates, \textit{i.e.}, $K=H^{l}\times W^{l}$; this assists the \textit{Global Polar Module} (the second stage) in learning from a diverse range of features, including various negative background anchor samples. Conversely, during the evaluation stage, anchors with lower confidence can be excluded such that $K\leq H^{l}\times W^{l}$. This strategy effectively filters out potential negative anchors and reduces the computational complexity of the second stage. By doing so, it maintains the adaptability and flexibility of the anchor distribution while decreasing the total number of anchors, especially in sparse scenarios. The following experiments demonstrate the effectiveness of different top-$K$ anchor selection strategies.
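The selection itself reduces to a single top-$k$ operation over the local-pole confidences; a sketch with hypothetical variable names is:
\begin{verbatim}
import torch

def select_topk_anchors(scores, theta, radius, k):
    """scores: (H^l*W^l,) local-pole confidences; theta, radius: anchor
    parameters. During training k = H^l*W^l (keep all anchors); at test
    time k can be smaller to drop likely-background anchors."""
    k = min(k, scores.numel())
    top_scores, idx = torch.topk(scores, k)
    return theta[idx], radius[idx], top_scores
\end{verbatim}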
\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{thesis_figure/detection_head.png}
\caption{The pipeline of the GPM integrates an RoI pooling layer with a triplet head. The triplet head comprises three components: the O2O classification, the O2M classification, and the O2M regression subheads. The dashed path with ``$\times$'' indicates that NMS is no longer necessary. Both sets of $\left\{s_i^g\right\}$ and $\left\{\tilde{s}_i^g\right\}$ participate in the process of selecting the ultimate non-redundant outcomes, a procedure referred to as dual confidence selection. During the backward training phase, the gradients from the O2O classification subhead (the blue dashed route with ``$\times$'') are stopped.}
\label{gpm}
\end{figure}
\subsection{Global Polar Module}
We introduce a novel \textit{Global Polar Module} (GPM) as the second stage to produce the final lane predictions. As illustrated in Fig. \ref{overall_architecture}, the GPM takes feature samples from the anchors proposed by the LPM and provides the precise locations and confidence scores of the final lane detection results. The overall architecture of the GPM is illustrated in Fig. \ref{gpm}. As shown in this figure, it consists of two components: an \textit{RoI pooling layer} and a \textit{triplet head}, which are detailed as follows.
\par
\textbf{RoI Pooling Layer.} It is designed to sample features for lane anchors from the feature maps. For ease of the sampling operation, we first transform the radius of each positive lane anchor in the local polar coordinate system, $r_j^l$, into its equivalent in the global polar coordinate system, $r_j^g$, by the following equation:
\begin{align}
r_{j}^{g}&=r_{j}^{l}+\left[ \cos \theta _j;\sin \theta _j \right] ^T\left( \boldsymbol{c}_{j}^{l}-\boldsymbol{c}^g \right), \label{l2g}\\
j &= 1, 2, \cdots, K, \notag
\end{align}
where $\boldsymbol{c}^{g} \in \mathbb{R}^{2}$ and $\boldsymbol{c}^{l}_{j} \in \mathbb{R}^{2}$ represent the Cartesian coordinates of the global pole and the $j$-th local pole, respectively. It is noteworthy that the angle $\theta_j$ remains unaltered, as the local and global polar coordinate systems share the same polar axis. Next, the feature points are sampled on each lane anchor as follows:
\begin{align}
x_{i,j}^{s}&=-y_{i,j}^{s}\tan \theta _j+\frac{r_{j}^{g}+\left[ \cos \theta _j;\sin \theta _j \right] ^T\boldsymbol{c}^g}{\cos \theta _j},\label{positions}\\
i&=1,2,\cdots,N;\ j=1,2,\cdots,K,\notag
\end{align}
where the y-coordinates $\boldsymbol{y}_{j}\equiv \{y_{1,j},y_{2,j},\cdots ,y_{N,j}\}$ of the $j$-th lane anchor are uniformly sampled vertically from the image, as previously mentioned. The proof of Eqs. (\ref{l2g})-(\ref{positions}) can be found in Appendix \ref{proof_l2g}. The coordinates of the $j$-th lane anchor are then given by $\boldsymbol{\ell}_j=\{\boldsymbol{x}_{j},\boldsymbol{y}_j\}\equiv \left\{(x_{1,j},y_{1,j}),(x_{2,j},y_{2,j}),\cdots ,(x_{N,j}, y_{N,j})\right\}$.
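To make Eqs. (\ref{l2g}) and (\ref{positions}) concrete, the following NumPy sketch (a hypothetical helper, not the released code) converts one anchor from the local to the global polar system and computes its sampled x-coordinates:
\begin{verbatim}
import numpy as np

def anchor_points(theta, r_local, c_local, c_global, ys):
    """theta, r_local: anchor parameters in its local polar system.
    c_local, c_global: Cartesian coordinates of the local/global poles.
    ys: the N uniformly spaced y-coordinates of the sampled points."""
    n = np.array([np.cos(theta), np.sin(theta)])   # [cos(theta); sin(theta)]
    r_global = r_local + n @ (c_local - c_global)  # Eq. (l2g)
    xs = -ys * np.tan(theta) \
         + (r_global + n @ c_global) / np.cos(theta)  # Eq. (positions)
    return r_global, xs
\end{verbatim}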
\par
Given the feature maps $\boldsymbol{P}_1, \boldsymbol{P}_2, \boldsymbol{P}_3$ from the different FPN levels, we can extract the channel-wise features of each point corresponding to the positions $\{(x_{1,j},y_{1,j}),(x_{2,j},y_{2,j}),\cdots,(x_{N,j},y_{N,j})\}_{j=1}^{K}$, respectively denoted as $\boldsymbol{F}_{1,j}, \boldsymbol{F}_{2,j}, \boldsymbol{F}_{3,j}\in \mathbb{R} ^{N\times C_f}$. To enhance the representation, similar to \cite{srlane}, we employ a weighted sum strategy to combine features from the three levels:
\begin{equation}
\boldsymbol{F}^s_j=\sum_{k=1}^3{\frac{e^{\boldsymbol{w}_{k}}}{\sum_{k=1}^3{e^{\boldsymbol{w}_{k}}}}\circ \boldsymbol{F}_{k,j} },
\end{equation}
where $\boldsymbol{w}_{k}\in \mathbb{R}^{N}$ represents the trainable aggregation weights ascribed to the $N$ sampled points, and the symbol ``$\circ$'' represents element-wise multiplication (\textit{i.e.}, the Hadamard product). Instead of directly concatenating the three sampled features into $\boldsymbol{F}^s_j\in \mathbb{R} ^{N\times 3C_f}$, the adaptive summation significantly reduces the feature dimension to $\boldsymbol{F}^s_j\in \mathbb{R} ^{N\times C_f}$, one-third of the dimension of the direct concatenation. The weighted sum of the tensors is flattened into a vector $\widehat{\boldsymbol{F}}^s_j\in \mathbb{R} ^{NC_f}$ and then integrated through a linear transformation:
\begin{align}
\boldsymbol{F}_{j}^{roi}\gets \boldsymbol{W}_{pool}\widehat{\boldsymbol{F}}_{j}^{s},\quad j=1,2,\cdots,K.\notag
\end{align}
Here, $\boldsymbol{W}_{pool}\in \mathbb{R} ^{d_r\times NC_f}$ is employed to further reduce the dimension of the integrated feature $\widehat{\boldsymbol{F}}_{j}^{s}$, thereby yielding the final RoI features $\{\boldsymbol{F}_{i}^{roi}\in \mathbb{R} ^{d_r}\}_{i=1}^K$, which are fed to the following triplet head.
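A rough sketch of this RoI pooling layer, assuming the per-level point features have already been gathered into a single tensor, could look like the following (class name and shapes are illustrative):
\begin{verbatim}
import torch
import torch.nn as nn

class PolarRoIPooling(nn.Module):
    """Sketch: per-point softmax weighting of the three FPN levels,
    flattening, and a linear projection to d_r dimensions."""
    def __init__(self, n_points, c_f, d_r):
        super().__init__()
        self.w = nn.Parameter(torch.zeros(3, n_points))  # logits w_k
        self.proj = nn.Linear(n_points * c_f, d_r)       # W_pool

    def forward(self, feats):                # feats: (3, K, N, C_f)
        alpha = torch.softmax(self.w, dim=0) # normalize over the 3 levels
        fused = (alpha[:, None, :, None] * feats).sum(dim=0)  # (K, N, C_f)
        return self.proj(fused.flatten(1))                     # (K, d_r)
\end{verbatim}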
\par
\textbf{Triplet Head.} The lane detection head classifies and regresses the lane anchors generated by the LPM based on the RoI pooling features $\{\boldsymbol{F}_{i}^{roi}\in \mathbb{R} ^{d_r}\}_{i=1}^K$. A traditional lane detection head\cite{laneatt} is usually equipped with a \textit{One-to-Many} (O2M) classification subhead and a \textit{One-to-Many} (O2M) regression subhead. However, the one-to-many mechanism (\textit{i.e.}, \textit{many candidates for one ground truth}) causes redundant predictions for each lane and thus requires an NMS post-processing operator. NMS is non-differentiable and breaks the end-to-end pipeline, which entails manually tuned hyperparameters and leads to suboptimal performance. To eliminate NMS post-processing while achieving end-to-end learning, we introduce a triplet head module for lane detection.
\par
As shown in Fig. \ref{gpm}, the triplet head consists of three components: the O2M classification, the O2M regression, and an additional \textit{One-to-One} (O2O) classification. The features of each lane anchor $\{\boldsymbol{F}_{j}^{roi}\}$ are fed into the three subheads, respectively. To keep both simplicity and efficiency, the O2M classification and O2M regression subheads each apply a two-layer \textit{multi-layer perceptron} (MLP) to $\{\boldsymbol{F}_{j}^{roi}\}$, generating the confidence scores $\left\{{s}_j^g\right\}$ from the O2M classification subhead and the x-coordinate offsets $\{\Delta\boldsymbol{x}_j\}$ from the O2M regression subhead for each lane anchor. More details about the O2M classification and O2M regression subheads can be found in \cite{yolox}. The O2O classification subhead is introduced to generate non-redundant lane candidates within an NMS-free paradigm. However, the direct use of a one-to-one strategy (\textit{i.e.}, \textit{assigning one positive anchor to one ground truth lane}) based on the extracted features would damage the model's learning\cite{dualassign}\cite{yolov10}. Instead, the proposed O2O classification subhead considers both the \textit{confidence prior} $\left\{{s}_j^g\right\}$ of the O2M classification subhead and the \textit{spatial geometric prior} of the polar parameters (\textit{i.e.}, the angle $\theta$ and the radius $r$), and applies these priors to adaptively refine the lane anchor features $\{\boldsymbol{F}_{j}^{roi}\}$, which yields the refined lane anchor features $\{\boldsymbol{D}_{j}^{roi}\}$ and the confidence scores $\left\{\tilde{s}_j^g\right\}$. The structural design draws inspiration from Fast NMS \cite{yolact}, with further particulars available in Appendix \ref{NMS_appendix}.
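As a rough sketch, the O2M classification and regression subheads could be implemented as two small MLPs over the RoI features (the hidden width and names below are placeholders; the O2O subhead is described next):
\begin{verbatim}
import torch
import torch.nn as nn

class O2MHeads(nn.Module):
    """Sketch of the O2M subheads: two small MLPs on the RoI features,
    producing confidences s_j^g and x-coordinate offsets."""
    def __init__(self, d_r, n_offsets, hidden=256):
        super().__init__()
        self.cls = nn.Sequential(nn.Linear(d_r, hidden), nn.ReLU(),
                                 nn.Linear(hidden, 1))
        self.reg = nn.Sequential(nn.Linear(d_r, hidden), nn.ReLU(),
                                 nn.Linear(hidden, n_offsets))

    def forward(self, f_roi):                             # f_roi: (K, d_r)
        scores = torch.sigmoid(self.cls(f_roi)).squeeze(-1)  # (K,)
        offsets = self.reg(f_roi)                             # (K, n_offsets)
        return scores, offsets
\end{verbatim}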
\par
More specifically, the O2O classification subhead first calculates the \textit{semantic distance} between the $i$-th anchor with x-coordinates $\boldsymbol{x}_{i}$ and the $j$-th anchor with x-coordinates $\boldsymbol{x}_{j}$ as follows:
\begin{align}
\widehat{\boldsymbol{F}}_{i}^{roi}&\gets \mathrm{ReLU}\left( \boldsymbol{W}_{roi}\boldsymbol{F}_{i}^{roi}+\boldsymbol{b}_{roi} \right), i=1,\cdots,K,\label{edge_layer_1}\\
\boldsymbol{F}_{ij}^{edge}&\gets \boldsymbol{W}_{in}\widehat{\boldsymbol{F}}_{j}^{roi}-\boldsymbol{W}_{out}\widehat{\boldsymbol{F}}_{i}^{roi},\label{edge_layer_2}\\
\boldsymbol{D}_{ij}^{edge}&\gets \mathrm{MLP}_{edge}\left(\boldsymbol{F}_{ij}^{edge}+\boldsymbol{W}_s\left( \boldsymbol{x}_{j}-\boldsymbol{x}_{i} \right) +\boldsymbol{b}_s \right),\label{edge_layer_3}
\end{align}
where $\boldsymbol{D}_{ij}^{edge}\in \mathbb{R}^{d_n}$ denotes the implicit semantic distance between the $i$-th and the $j$-th predictions, $\mathrm{ReLU}$ is the ReLU activation function, $\mathrm{MLP}_{edge}$ denotes a two-layer MLP, and $\{\boldsymbol{W}_{roi}, \boldsymbol{W}_{in}, \boldsymbol{W}_{out}, \boldsymbol{W}_s, \boldsymbol{b}_{roi}, \boldsymbol{b}_s\}$ are the model parameters to be trained. However, it is still difficult to make a one-to-one assignment to a ground truth instance based on the above semantic distance alone, as some forked anchors may have similar distances. To increase the semantic distance gaps among anchors, we need to suppress the features of similar or overlapping anchors. To this end, we design an adjacency matrix $\boldsymbol{A}\in\mathbb{R}^{K\times K}$ defined as
\begin{equation}
\boldsymbol{A}=\boldsymbol{A}^C\odot\boldsymbol{A}^G,
\end{equation}
where $\odot$ denotes element-wise multiplication, and $\boldsymbol{A}^C\in\mathbb{R}^{K\times K}$ and $\boldsymbol{A}^G\in\mathbb{R}^{K\times K}$ are the confidence-prior adjacency matrix and the geometric-prior adjacency matrix, respectively. The confidence-prior adjacency matrix $\boldsymbol{A}^C=\left(A_{ij}^C\right)_{i,j=1}^K$ is defined as follows:
\begin{align}
A_{ij}^{C}=\begin{cases}
1,\,\,\mathrm{if}\,\,s_i^g>s_j^g\,\,\mathrm{or}\,\,\left( s_i^g=s_j^g\,\,\mathrm{and}\,\,i>j \right);\\
0,\,\,\mathrm{otherwise}.
\end{cases}
\label{confidential matrix1}
\end{align}
Here, $s_i^g$ and $s_j^g$ are the confidence scores of the $i$-th and the $j$-th lane anchors, as predicted by the O2M classification subhead. According to Eq. \eqref{confidential matrix1}, the role of $\boldsymbol{A}^C$ is to allow lane anchors with higher confidence scores to suppress those with lower scores. To further leverage geometric priors based on the polar representation (\textit{i.e.}, the global polar radius $r^g$ and angle $\theta$), we introduce the geometric-prior adjacency matrix $\boldsymbol{A}^G=\left(A_{ij}^G\right)_{i,j=1}^K$, defined by
\begin{align}
A_{ij}^{G}=\begin{cases}
1,\,\,\mathrm{if}\,\,\left| \theta _i-\theta _j \right|<\tau^{\theta}\,\,\mathrm{and}\,\,\left| r_{i}^{g}-r_{j}^{g} \right|<\lambda^g;\\
0,\,\,\mathrm{otherwise},
\end{cases}
\label{geometric prior matrix1}
\end{align}
where $\tau^{\theta}$ and $\lambda^g$ are thresholds on the geometric distances. With the geometric-prior and confidence-prior adjacency matrices defined, the overall adjacency matrix $\boldsymbol{A}$ can be viewed as a directed graph with each lane anchor as a node and the RoI features $\boldsymbol{F}_i^{roi}$ serving as the node input features. Specifically, if an element $A_{ij}$ of $\boldsymbol{A}$ equals $1$, a directed edge exists from the $i$-th anchor to the $j$-th anchor, which implies that the $j$-th anchor may be suppressed by the $i$-th anchor when the confidence score of the $i$-th anchor exceeds that of the $j$-th anchor and their geometric distance is sufficiently small (\textit{i.e.}, less than the predefined thresholds).
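The two prior matrices in Eqs. (\ref{confidential matrix1}) and (\ref{geometric prior matrix1}) can be built with simple pairwise comparisons; a sketch with hypothetical tensor names is shown below.
\begin{verbatim}
import torch

def build_adjacency(s, theta, r_g, tau_theta, lambda_g):
    """s: (K,) O2M confidences; theta, r_g: (K,) global polar parameters."""
    idx = torch.arange(s.numel())
    # confidence prior: a higher-scoring anchor may suppress a lower one
    a_c = (s[:, None] > s[None, :]) | \
          ((s[:, None] == s[None, :]) & (idx[:, None] > idx[None, :]))
    # geometric prior: only geometrically close anchors interact
    a_g = ((theta[:, None] - theta[None, :]).abs() < tau_theta) & \
          ((r_g[:, None] - r_g[None, :]).abs() < lambda_g)
    return a_c & a_g                        # (K, K) boolean adjacency A
\end{verbatim}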
\par
Then, by considering the suppressive effect among lane anchors induced by the overall adjacency matrix $\boldsymbol{A}$, the lane anchor features $\boldsymbol{F}_j^{roi}$ can be further refined from the semantic distance tensor $\mathcal{D}^{edge}=\{\boldsymbol{D}_{ij}^{edge}\}\in\mathbb{R}^{K\times K\times d_n}$ as follows:
\begin{align}
\boldsymbol{D}_j^{roi}\in \mathbb{R}^{d_n}\gets\mathrm{MPool_{col}}\left(\mathcal{D}^{edge}(:,j,:)\,|\,\boldsymbol{A}(:,j)=1\right),
\label{maxpooling}
\end{align}
where $j=1,2,\cdots,K$ and $\mathrm{MPool_{col}}(\cdot\,|\,\boldsymbol{A}(:,j)=1)$ is an element-wise max pooling operator applied along the $j$-th column of $\mathcal{D}^{edge}$, restricted to the entries with $A_{ij}=1$. This is inspired by existing works\cite{o3d}\cite{pointnet} and aims to extract the most distinctive features from the lane anchors that may potentially suppress the anchor being refined. With the refined anchor features $\boldsymbol{D}_j^{roi}$, the final confidence scores of the O2O classification subhead are generated by a three-layer MLP:
\begin{align}
\tilde{s}_{j}^{g}\gets \mathrm{MLP}_{roi}\left( \boldsymbol{D}_{j}^{roi} \right), j=1,\cdots,K. \label{node_layer}
\end{align}
As stated above, the O2O classification subhead is formed by Eqs. (\ref{edge_layer_1})-(\ref{node_layer}) and can be seen as a neural network operating on the directed anchor graph.
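Putting Eqs. (\ref{edge_layer_1})-(\ref{node_layer}) together, the masked column-wise max pooling of Eq. (\ref{maxpooling}) and the final scoring can be sketched as follows; this is a simplification, and giving zero refined features to anchors without any incoming edge is our own assumption.
\begin{verbatim}
import torch

def o2o_scores(d_edge, adj, mlp_roi):
    """d_edge: (K, K, d_n) semantic distances D_ij^edge; adj: (K, K) bool A;
    mlp_roi: the three-layer MLP mapping refined features to scores."""
    neg_inf = torch.finfo(d_edge.dtype).min
    masked = torch.where(adj[..., None], d_edge,
                         torch.full_like(d_edge, neg_inf))
    d_roi = masked.max(dim=0).values          # max over suppressors i, per j
    has_suppressor = adj.any(dim=0)[:, None]  # anchors with incoming edges
    d_roi = torch.where(has_suppressor, d_roi, torch.zeros_like(d_roi))
    return torch.sigmoid(mlp_roi(d_roi)).squeeze(-1)  # O2O scores, (K,)
\end{verbatim}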
\par
\textbf{Dual Confidence Selection for the NMS-Free Paradigm.} With the help of the adjacency matrix $\boldsymbol{A}$, the variability among the semantic features $\{\boldsymbol{D}_j^{roi}\}$ is enlarged, resulting in a significant gap between the confidence scores $\{\tilde{s}_{j}^{g}\}$ generated by the O2O classification subhead, which makes them easier to distinguish. Therefore, unlike conventional methods that feed the confidence scores $\{s_{j}^{g}\}$ obtained by the O2M classification subhead into an NMS post-processing stage to remove redundant candidates, we adopt the following dual confidence selection criterion for selecting positive anchors:
\begin{align}
\Omega^{pos}=\left\{i\,|\,\tilde{s}_{i}^{g}>\tau_{o2o} \right\} \cap \left\{ i\,|\,s_{i}^{g}>\tau_{o2m} \right\},
\end{align}
where $\tau_{o2o}$ and $\tau_{o2m}$ are two confidence thresholds. The set $\Omega^{pos}$ yields non-redundant positive predictions without NMS post-processing, since the O2O classification subhead enhances the variability of confidence scores among similar anchors, making the selection less sensitive to the two thresholds.
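The selection itself is then just two thresholding operations; a sketch with hypothetical names:
\begin{verbatim}
import torch

def dual_confidence_selection(s_o2m, s_o2o, tau_o2m, tau_o2o):
    """Keep anchors whose O2M and O2O confidences both exceed their
    thresholds; returns the index set Omega^pos."""
    keep = (s_o2o > tau_o2o) & (s_o2m > tau_o2m)
    return torch.nonzero(keep, as_tuple=False).squeeze(-1)
\end{verbatim}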
\par
\textbf{Loss Function for GPM.} After obtaining the positive candidate set $\Omega^{pos}$ for the O2O classification subhead, the Hungarian algorithm \cite{detr} is applied to perform label assignment, \textit{i.e.}, a one-to-one assignment between the positive anchors and the ground truth instances. For the O2M classification and O2M regression subheads, we use the same approach as SimOTA \cite{yolox} for label assignment. More details about label assignment can be found in Appendix \ref{assign_appendix}. During training, the Focal loss \cite{focal} is applied to both the O2O and the O2M classification subheads, yielding the terms $\mathcal{L}^{o2o}_{cls}$ and $\mathcal{L}^{o2m}_{cls}$, respectively. Furthermore, we adopt the rank loss $\mathcal{L}_{rank}$ \cite{pss} to amplify the disparity between the positive and negative confidences of the O2O classification subhead. Note that, similar to \cite{pss}, we stop the gradient flow from the O2O classification subhead during training to preserve the quality of RoI feature learning. To train the O2M regression subhead, we redefine the GIoU concept (see Appendix \ref{giou_appendix} for details) and adopt the GIoU loss $\mathcal{L}_{GIoU}^{o2m}$ to regress the x-coordinate offsets $\{\Delta\boldsymbol{x}_j\}$ for each positive lane anchor. The end points of lanes are trained with a $Smooth_{L1}$ loss $\mathcal{L}_{end}^{o2m}$. In addition, we propose an auxiliary loss $\mathcal{L}_{aux}$ to facilitate the learning of global features. As illustrated in Fig. \ref{auxloss}, the anchors and ground truth are divided into several segments, with each anchor segment regressed to the primary components of the corresponding segment of the ground truth. The auxiliary loss $\mathcal{L}_{aux}$ helps the detection head gain a deeper understanding of the global geometric structure. Finally, the classification loss $\mathcal{L} _{cls}^{g}$ and the regression loss $\mathcal{L} _{reg}^{g}$ for the GPM are given as follows:
\begin{align}
\mathcal{L} _{cls}^{g}&=w^{o2m}_{cls}\mathcal{L}^{o2m}_{cls}+w^{o2o}_{cls}\mathcal{L}^{o2o}_{cls}+w_{rank}\mathcal{L}_{rank},
\\
\mathcal{L} _{reg}^{g}&=w_{GIoU}^{o2m}\mathcal{L}_{GIoU}^{o2m}+w_{end}^{o2m}\mathcal{L}_{end}^{o2m}+w_{aux}\mathcal{L} _{aux},
\end{align}
where $w^{o2m}_{cls}, w^{o2o}_{cls}, w_{rank}, w_{GIoU}^{o2m}, w_{end}^{o2m}$, and $w_{aux}$ are constant weights used to balance the different loss terms.
\subsection{The Overall Loss Function}
Combining the losses of the two stages, the overall loss function is given as follows:
\begin{align}
\mathcal{L} =\mathcal{L} _{cls}^{l}+\mathcal{L} _{reg}^{l}+\mathcal{L} _{cls}^{g}+\mathcal{L} _{reg}^{g},
\end{align}
where $\mathcal{L} _{cls}^{l}$ and $\mathcal{L} _{reg}^{l}$ are used to train the classification and regression parameters of the LPM in the first stage, and $\mathcal{L} _{cls}^{g}$ and $\mathcal{L} _{reg}^{g}$ are used to train those of the GPM in the second stage.
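Schematically, the loss assembly can be written as below; the dictionary keys and weight names are placeholders, and each term is assumed to have been computed as described above.
\begin{verbatim}
def gpm_losses(losses, w):
    """losses: dict of GPM loss terms; w: dict of constant weights."""
    l_cls_g = (w["o2m_cls"] * losses["o2m_cls"]
               + w["o2o_cls"] * losses["o2o_cls"] + w["rank"] * losses["rank"])
    l_reg_g = (w["giou"] * losses["giou"] + w["end"] * losses["end"]
               + w["aux"] * losses["aux"])
    return l_cls_g, l_reg_g

def total_loss(l_cls_l, l_reg_l, l_cls_g, l_reg_g):
    """Overall objective: sum of LPM and GPM classification/regression losses."""
    return l_cls_l + l_reg_l + l_cls_g + l_reg_g
\end{verbatim}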
%
\begin{figure}[t]
\centering
\includegraphics[width=0.85\linewidth]{thesis_figure/auxloss.png} %
\caption{Auxiliary loss for segment parameter regression. The ground truth lane curve is partitioned into several segments, with the parameters of each segment denoted as $\left( \hat{\theta}_{i,\cdot}^{seg},\hat{r}_{i,\cdot}^{seg} \right)$. The model outputs the parameter offsets $\left( \varDelta \theta _{j,\cdot},\varDelta r_{j,\cdot}^{g} \right)$ to regress from the original anchor to each target line segment.}
\label{auxloss}
\end{figure}
%
\section{Experiment}
\subsection{Dataset and Evaluation Metric}
We conducted experiments on four widely used lane detection benchmarks and one rail detection dataset: CULane\cite{scnn}, TuSimple\cite{tusimple}, LLAMAS\cite{llamas}, CurveLanes\cite{curvelanes}, and DL-Rail\cite{dalnet}. Among these datasets, CULane and CurveLanes are particularly challenging. The CULane dataset consists of various scenarios but has sparse lane distributions, whereas CurveLanes includes a large number of curved and dense lane types, such as forked and double lanes. The DL-Rail dataset, focused on rail detection across different scenarios, is chosen to evaluate our model's performance beyond traditional lane detection.
@@ -397,12 +357,12 @@ For Tusimple, the evaluation is formulated as follows:
where $C_{clip}$ and $S_{clip}$ represent the number of correct points (predicted points within 20 pixels of the ground truth) and the number of ground truth points, respectively. If the accuracy exceeds 85\%, the prediction is considered correct. TuSimple also reports the False Positive Rate ($\mathrm{FPR}=1-\mathrm{Precision}$) and the False Negative Rate ($\mathrm{FNR}=1-\mathrm{Recall}$).
\subsection{Implementation Details}
All input images are cropped and resized to $800\times320$. Similar to \cite{clrnet}, we apply random affine transformations and random horizontal flips. For optimization, we use the AdamW \cite{adam} optimizer with learning rate warm-up and a cosine decay strategy. The initial learning rate is set to 0.006. The numbers of sampled points and regression points for each lane anchor are set to 36 and 72, respectively. The power coefficient of the cost function, $\beta$, is set to 6. The whole model (including the LPM and GPM) is trained end-to-end, as in \cite{adnet}\cite{srlane}. All experiments are conducted on a single NVIDIA A100-40G GPU. To keep our model simple, we only use CNN-based backbones, namely ResNet\cite{resnet} and DLA34\cite{dla}. Other details can be found in Appendix \ref{vis_appendix}.
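For reference, the settings listed above could be collected into a configuration similar to the following; the dictionary keys are illustrative and the values are taken from the text.
\begin{verbatim}
# Hypothetical configuration mirroring the training settings described above;
# names and structure are illustrative, not the authors' actual config file.
config = {
    "input_size": (800, 320),          # cropped and resized input resolution
    "augmentation": ["random_affine", "random_horizontal_flip"],
    "optimizer": "AdamW",
    "initial_lr": 0.006,               # with warm-up and cosine decay
    "num_sampled_points": 36,          # feature points per lane anchor
    "num_regression_points": 72,       # regressed points per lane
    "cost_power_beta": 6,
    "backbones": ["ResNet", "DLA34"],
}
\end{verbatim}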
\begin{table*}[htbp]
\centering
\caption{Comparison results on the CULane test set with other methods.}
\normalsize
\begin{adjustbox}{width=\linewidth}
\begin{tabular}{lrlllllllllll}
@ -463,7 +423,7 @@ All input images are cropped and resized to $800\times320$. Similar to \cite{clr
\begin{table}[h]
\centering
\caption{Comparison results on the TuSimple test set with other methods.}
\begin{adjustbox}{width=\linewidth}
\begin{tabular}{lrcccc}
\toprule
@ -488,7 +448,7 @@ All input images are cropped and resized to $800\times320$. Similar to \cite{clr
\begin{table}[h]
\centering
\caption{Comparison results on the LLAMAS test set with other methods.}
\begin{adjustbox}{width=\linewidth}
\begin{tabular}{lrcccc}
\toprule
@ -515,7 +475,7 @@ All input images are cropped and resized to $800\times320$. Similar to \cite{clr
\begin{table}[h]
\centering
\caption{Comparison results on the DL-Rail test set with other methods.}
\begin{adjustbox}{width=\linewidth}
\begin{tabular}{lrccc}
\toprule
@ -539,7 +499,7 @@ All input images are cropped and resized to $800\times320$. Similar to \cite{clr
\begin{table}[h]
\centering
\caption{Comparison results on the CurveLanes validation set with other methods.}
\begin{adjustbox}{width=\linewidth}
\begin{tabular}{lrcccc}
\toprule
@ -567,7 +527,7 @@ All input images are cropped and resized to $800\times320$. Similar to \cite{clr
\subsection{Comparison with State-of-the-Art Methods}
The comparison results of our proposed model with other methods are shown in Tables \ref{culane result}, \ref{tusimple result}, \ref{llamas result}, \ref{dlrail result}, and \ref{curvelanes result}. We present results for two versions of our model: the NMS-based version, denoted as Polar R-CNN-NMS, and the NMS-free version, denoted as Polar R-CNN. The NMS-based version utilizes predictions $\left\{s_i^g\right\}$ obtained from the O2M head followed by NMS post-processing, while the NMS-free version derives predictions via dual confidence selection.

To ensure a fair comparison, we also include results for CLRerNet \cite{clrernet} on the CULane and CurveLanes datasets, as we use a similar training strategy and dataset splits. As illustrated in the comparison results, our model demonstrates competitive performance across all five datasets. Specifically, on the CULane, TuSimple, LLAMAS, and DL-Rail datasets with sparse scenarios, our model outperforms other anchor-based methods. Additionally, the performance of the NMS-free version is nearly identical to that of the NMS-based version, highlighting the effectiveness of the O2O classification subhead in eliminating redundant predictions in sparse scenarios. On the CurveLanes dataset, the NMS-free version achieves superior F1-measure and Recall compared to other methods.

We also compare the number of anchors and the processing speed with other methods. Fig. \ref{anchor_num_method} illustrates the number of anchors used by several anchor-based methods on the CULane dataset. Our proposed model utilizes the fewest proposal anchors (20 anchors) while achieving the highest F1-score on CULane. It remains competitive with state-of-the-art methods like CLRerNet, which uses 192 anchors and cross-layer refinement. Conversely, Sparse Laneformer, which also uses 20 anchors, does not achieve optimal performance. It is important to note that our model is designed with a simpler structure, without complicated components such as cross-layer refinement, indicating the pivotal role of flexible anchors under polar coordinates in enhancing performance in sparse scenarios. Furthermore, due to its simple structure and fewer anchors, our model exhibits lower latency than most methods, as shown in Fig. \ref{speed_method}.
\begin{figure}[t]
@ -663,11 +623,11 @@ We also explore the effect of different local polar map sizes on our model, as i
\label{cam}
\end{figure}
\textbf{Ablation study on NMS-free block in sparse scenarios.} We conduct several experiments on the CULane dataset to evaluate the performance of the NMS-free paradigm in sparse scenarios. As shown in Table \ref{aba_NMSfree_block}, without using the GNN to establish relationships between anchors, Polar R-CNN fails to achieve an NMS-free paradigm, even with one-to-one assignment. Furthermore, the confidence-prior adjacency matrix $\boldsymbol{A}^{C}$ proves crucial, indicating that conditional probability is effective. Other components, such as the geometric-prior adjacency matrix $\boldsymbol{A}^{G}$ and the rank loss, also contribute to the performance of the NMS-free block.

To compare the NMS-free paradigm with the traditional NMS paradigm, we perform experiments with the NMS-free block under both the proposal and fixed anchor strategies (the latter employing a fixed set of anchors as illustrated in Fig. \ref{anchor setting}(b)). Table \ref{NMS vs NMS-free} presents the results of these experiments. In the table, ``O2M'' and ``O2O'' refer to the NMS paradigm (the gray dashed route in Fig. \ref{gpm}) and the NMS-free paradigm (the green route in Fig. \ref{gpm}), respectively. The suffix ``-B'' signifies that the head consists solely of MLPs, whereas ``-G'' indicates that the head is equipped with the GNN architecture. In the fixed anchor paradigm, although the O2O classification subhead without the GNN effectively eliminates redundant predictions, the performance is still improved by incorporating the GNN structure. In the proposal anchor paradigm, the O2O classification subhead without the GNN fails to eliminate redundant predictions due to high anchor overlaps. Thus, the GNN structure is essential for Polar R-CNN in the NMS-free paradigm. In both the fixed and proposal anchor paradigms, the O2O classification subhead with the GNN structure successfully eliminates redundant predictions, indicating that our GNN-based O2O classification subhead can supplant NMS post-processing in sparse scenarios without a decline in performance.

We also explore the stop-gradient strategy for the O2O classification subhead. As shown in Table \ref{stop}, the gradient of the O2O classification subhead negatively impacts both the O2M classification subhead (with NMS post-processing) and the O2O classification subhead itself. This observation indicates that the one-to-one assignment induces significant bias into feature learning, thereby underscoring the necessity of the stop-gradient strategy to preserve optimal performance.
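A minimal PyTorch-style sketch of the stop-gradient strategy is given below: the RoI features fed to the O2O classification subhead are detached, so its one-to-one loss cannot back-propagate into the shared features; the module names and dimensions are illustrative assumptions rather than our actual head design.
\begin{verbatim}
import torch
import torch.nn as nn

class TripletHeadSketch(nn.Module):
    def __init__(self, dim=64):
        super().__init__()
        self.o2m_cls = nn.Linear(dim, 1)   # one-to-many classification subhead
        self.o2m_reg = nn.Linear(dim, 4)   # one-to-many regression subhead (toy size)
        self.o2o_cls = nn.Linear(dim, 1)   # one-to-one classification subhead

    def forward(self, roi_feat):
        s_o2m = self.o2m_cls(roi_feat)
        reg = self.o2m_reg(roi_feat)
        # stop-gradient: the O2O branch sees the features but cannot alter them
        s_o2o = self.o2o_cls(roi_feat.detach())
        return s_o2m, reg, s_o2o

head = TripletHeadSketch()
feats = torch.randn(20, 64, requires_grad=True)   # 20 anchors, 64-d RoI features
s_o2m, reg, s_o2o = head(feats)
s_o2o.sum().backward()
print(feats.grad)   # None: no gradient reaches the shared features from the O2O subhead
\end{verbatim}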
\begin{table}[h]
\centering
@ -689,10 +649,6 @@ We also explore the stop-gradient strategy for the O2O classification head. As s
\end{table}
\begin{table}[h]
\centering
\caption{The ablation study for NMS and NMS-free on the CULane test set.}
@ -702,8 +658,8 @@ We also explore the stop-gradient strategy for the O2O classification head. As s
\multicolumn{2}{c|}{\textbf{Anchor strategy~/~assign}} & \textbf{F1@50 (\%)} & \textbf{Precision (\%)} & \textbf{Recall (\%)} \\
\midrule
\multirow{6}*{Fixed}
&O2M w/~ NMS &80.38&87.44&74.38\\
&O2M w/o NMS &44.03\textcolor{darkgreen}{~(36.35$\downarrow$)}&31.12\textcolor{darkgreen}{~(56.32$\downarrow$)}&75.23\textcolor{red}{~(0.85$\uparrow$)}\\
\cline{2-5}
&O2O-B w/~ NMS &78.72&87.58&71.50\\
&O2O-B w/o NMS &78.23\textcolor{darkgreen}{~(0.49$\downarrow$)}&86.26\textcolor{darkgreen}{~(1.32$\downarrow$)}&71.57\textcolor{red}{~(0.07$\uparrow$)}\\
@ -712,8 +668,8 @@ We also explore the stop-gradient strategy for the O2O classification head. As s
&O2O-G w/o NMS &80.27\textcolor{darkgreen}{~(0.10$\downarrow$)}&87.14\textcolor{darkgreen}{~(0.30$\downarrow$)}&74.40\textcolor{red}{~(0.03$\uparrow$)}\\
\midrule
\multirow{6}*{Proposal}
&O2M w/~ NMS &80.81&88.53&74.33\\
&O2M w/o NMS &36.46\textcolor{darkgreen}{~(44.35$\downarrow$)}&24.09\textcolor{darkgreen}{~(64.44$\downarrow$)}&74.93\textcolor{red}{~(0.6$\uparrow$)}\\
\cline{2-5}
&O2O-B w/~ NMS &77.27&92.64&66.28\\
&O2O-B w/o NMS &47.11\textcolor{darkgreen}{~(30.16$\downarrow$)}&36.48\textcolor{darkgreen}{~(56.16$\downarrow$)}&66.48\textcolor{red}{~(0.20$\uparrow$)}\\
@ -754,9 +710,9 @@ We also explore the stop-gradient strategy for the O2O classification head. As s
\textbf{Ablation study on NMS-free block in dense scenarios.} Despite demonstrating the feasibility of replacing NMS with the O2O classification subhead in sparse scenarios, the shortcomings of NMS in dense scenarios remain. To investigate the performance of the NMS-free block in dense scenarios, we conduct experiments on the CurveLanes dataset, as detailed in Table \ref{aba_NMS_dense}.

In the traditional NMS post-processing \cite{clrernet}, the default IoU threshold is set to 50 pixels. However, this default setting may not always be optimal, especially in dense scenarios where some lane predictions might be erroneously eliminated. Lowering the IoU threshold increases recall but decreases precision. To find the most effective IoU threshold, we experiment with various values and find that a threshold of 15 pixels achieves the best trade-off, resulting in an F1-score of 86.81\%. In contrast, the NMS-free paradigm with the GNN-based O2O classification subhead achieves an overall F1-score of 87.29\%, which is 0.48\% higher than the optimal threshold setting in the NMS paradigm. Additionally, both precision and recall are improved under the NMS-free approach. This indicates that the O2O classification subhead with the proposed GNN structure is capable of learning both explicit geometric distances and implicit semantic distances between anchors, thus providing a more effective solution for dense scenarios than traditional NMS post-processing.
\begin{table}[h]
\centering
@ -786,7 +742,7 @@ In the traditional NMS post-processing \cite{clrernet}, the default IoU threshol
\section{Conclusion and Future Work}
In this paper, we propose Polar R-CNN to address two key issues in anchor-based lane detection methods. By incorporating local and global polar coordinate systems, Polar R-CNN achieves improved performance with fewer anchors. Additionally, the introduction of the O2O classification subhead with a GNN block allows us to replace traditional NMS post-processing, and the NMS-free paradigm demonstrates superior performance in dense scenarios. Our model is highly flexible, and the number of anchors can be adjusted based on the specific scenario. Users have the option to use either the O2M classification subhead with NMS post-processing or the O2O classification subhead for an NMS-free approach. Polar R-CNN is also deployment-friendly due to its simple structure, making it a potential new baseline for lane detection. Future work could explore incorporating new structures, such as large kernels or attention mechanisms, and experimenting with new label assignment, training, and anchor sampling strategies. We also plan to extend Polar R-CNN to video instance lane detection and 3D lane detection, utilizing advanced geometric modeling for these new tasks.
% %
% %
% %
@ -806,6 +762,16 @@ In this paper, we propose Polar R-CNN to address two key issues in anchor-based
received the B.Sc. and Ph.D. degrees from Xi'an Jiaotong University, Xi'an, China, in 2012 and 2018, respectively. From 2016 to 2017, he was a Visiting Scholar with Columbia University, New York, NY, USA. He is an Associate Professor with the School of Computer Science and Technology, Xi'an Jiaotong University. His research interests include statistical modeling
and image processing.
\end{IEEEbiography}
\begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/photo_ZengjieSong.jpg}}]{Zengjie Song}
received the B.S. degree in applied mathematics and the Ph.D. degree in statistics from Xi'an Jiaotong University (XJTU), Xi'an, China, in 2013 and 2020, respectively. From 2017 to 2018, he was a visiting Ph.D. student with the Department of Computer Science, University of Illinois at Urbana-Champaign, Champaign, IL, USA. From 2020 to 2023, he was a Postdoctoral Researcher with the Institute of Automation, Chinese Academy of Sciences, Beijing, China. In May 2023, he returned to XJTU, where he is currently an Assistant Professor with the Department of Statistics. His research interests include predictive coding, multimodal learning, generative models, and computer vision, with an emphasis on the intersection of machine learning and computational neuroscience.
\end{IEEEbiography}
\begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thesis_figure/sunkai.jpg}}]{Kai Sun}
received his Ph.D. degree in statistics from Xi'an Jiaotong University, Xi'an, China, in 2020. He joined Xi'an Jiaotong University in 2020, where he is currently an Associate Professor in the School of Mathematics and Statistics. His research interests include deep learning and image processing. To date, he has authored and coauthored one monograph and more than 20 academic papers, primarily in journals such as IEEE TIP, IEEE TNNLS, and others. Additionally, he has published one ESI highly cited paper and one ESI hot paper as the first author.
\end{IEEEbiography}
\vfill
@ -855,22 +821,21 @@ where $p_{i,j}^{s}$ represents the $i$-th sampled point of the $j$-th lane ancho
\section{The Design Principles of the One-to-One Classification Head}
Two fundamental prerequisites of the NMS-free framework lie in the label assignment strategy and the head structure.
As for the label assignment strategy, previous works use one-to-many label assignment, which makes the detection head produce redundant predictions for each ground truth and thus requires NMS post-processing. Therefore, some works \cite{detr}\cite{learnNMS} propose one-to-one label assignment, such as the Hungarian algorithm, which forces the model to predict a single positive sample for each ground truth lane.

However, directly applying one-to-one label assignment damages the learning of the model, and plain structures such as MLPs and CNNs struggle to assimilate the ``one-to-one'' characteristic, resulting in decreased performance compared to one-to-many label assignment with NMS post-processing\cite{yolov10}\cite{o2o}. Consider a trivial example: let $\boldsymbol{F}^{roi}_{i}$ denote the RoI features extracted from the $i$-th anchor, and suppose the model is trained with one-to-one label assignment. Assume that the $i$-th and $j$-th anchors are both close to the ground truth and overlap with each other; their RoI features are then similar, which can be expressed as follows:
\begin{align}
\boldsymbol{F}_{i}^{roi}\approx \boldsymbol{F}_{j}^{roi}.
\end{align}
Suppose that $\boldsymbol{F}^{roi}_{i}$ is assigned as a positive sample while $\boldsymbol{F}^{roi}_{j}$ is assigned as a negative sample; the ideal outcome should then be:
\begin{align}
f_{cls}\left( \boldsymbol{F}_{i}^{roi} \right) &\rightarrow 1, \notag\\
f_{cls}\left( \boldsymbol{F}_{j}^{roi} \right) &\rightarrow 0,
\label{sharp fun}
\end{align}
where $f_{cls}$ represents a classification head with an ordinary structure such as MLPs or CNNs. Eq. (\ref{sharp fun}) implies that $f_{cls}$ needs to be ``sharp'' enough to differentiate between two similar features; in other words, its output must change rapidly over small distances in the feature space. Such a ``sharp'' pattern is hard for MLPs or CNNs to learn on their own. Consequently, additional heuristic structures like \cite{o3d}\cite{relationnet} need to be developed.
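The following toy NumPy sketch illustrates this difficulty: a randomly initialized two-layer MLP maps two nearly identical RoI feature vectors to nearly identical scores, so a plain head cannot push one toward 1 and the other toward 0 without becoming extremely sharp; the network width and perturbation scale are arbitrary assumptions.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
W1 = 0.1 * rng.normal(size=(64, 128)); b1 = np.zeros(128)
W2 = 0.1 * rng.normal(size=128); b2 = 0.0

def plain_head(f):
    """A plain two-layer head: ReLU MLP followed by a sigmoid."""
    h = np.maximum(f @ W1 + b1, 0.0)
    return 1.0 / (1.0 + np.exp(-(h @ W2 + b2)))

f_i = rng.normal(size=64)                 # RoI features of anchor i
f_j = f_i + 1e-3 * rng.normal(size=64)    # anchor j: almost the same features
print(plain_head(f_i), plain_head(f_j))   # nearly identical scores
print(abs(plain_head(f_i) - plain_head(f_j)))  # tiny gap, far from the ideal 1 vs 0
\end{verbatim}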
We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classification subhead. Fast NMS serves as an iteration-free post-processing algorithm based on traditional NMS. Furthermore, we have incorporated a sort-free strategy along with geometric priors into Fast NMS, with the specifics delineated in Algorithm \ref{Graph Fast NMS}.
\begin{figure}[t]
@ -887,11 +852,11 @@ We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classi
\REQUIRE ~~\\ % Algorithm inputs
The indices of all anchors, $1, 2, ..., i, ..., K$;\\
The corresponding positive anchors, $\left\{ \theta _i,r_{i}^{g} \right\} |_{i=1}^{K}$;\\
The confidence emanating from the O2M classification subhead, $s_i^g$;\\
The regressions emanating from the O2M regression subhead, denoted as $\left\{ Lane_i \right\} |_{i=1}^{K}$;\\
The predetermined thresholds $\tau^{\theta}$, $\tau^{r}$, $\tau^{d}$ and $\tau_{o2m}$.
\ENSURE ~~\\ % Algorithm output
\STATE Calculate the confidence-prior adjacency matrix $\boldsymbol{A}^{C}\in\mathbb{R}^{K\times K}$, defined as follows:
\begin{align}
A_{ij}^{C}=\begin{cases}
	1, s_i^g>s_j^g\,\,or\,\,\left( s_i^g=s_j^g\,\,and\,\,i>j \right)\\
@ -899,7 +864,7 @@ We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classi
\end{cases}
\label{confidential matrix}
\end{align}
\STATE Calculate the geometric-prior adjacency matrix $\boldsymbol{A}^{G}\in\mathbb{R}^{K\times K}$, which is defined as follows:
\begin{align}
A_{ij}^{G}=\begin{cases}
	1, \left| \theta _i-\theta _j \right|<\tau^{\theta}\,\,and\,\,\left| r_{i}^{g}-r_{j}^{g} \right|<\tau^r\\
@ -916,14 +881,15 @@ We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classi
\STATE Define the adjacency matrix $\boldsymbol{A} = \boldsymbol{A}^{C} \odot \boldsymbol{A}^{G}$; the final confidence $\tilde{s}_j^g$ is calculated as follows:
\begin{align}
\tilde{s}_{j}^{g}=\begin{cases}
	1, \mathrm{Max}\left(\mathcal{D}(:,j)\,|\,\boldsymbol{A}(:,j)=1\right)<\left( \tau ^d \right) ^{-1},\\
	0, \mathrm{otherwise}\\
\end{cases}
\label{al_1-4}
\end{align}
where $j=1,2,\cdots,K$ and $\mathrm{Max}\left(\cdot\,|\,\boldsymbol{A}(:,j)=1\right)$ denotes the max operator over the $j$-th column, restricted to the elements with $A_{kj}=1$.
\STATE Get the final selection set:
\begin{align}
\varOmega_{nms}^{pos}=\left\{ j\,|\,s_{j}^{g}>\tau_{o2m}\,\,and\,\,\tilde{s}_{j}^{g}=1 \right\}
\label{al_1-5}
\end{align}
@ -932,34 +898,23 @@ We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classi
\label{Graph Fast NMS}
\end{algorithm}
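For reference, a NumPy sketch of Algorithm \ref{Graph Fast NMS} is given below; the pairwise lane distance matrix is assumed to be precomputed (\textit{e.g.}, from GLaneIoU), its entries are converted to inverse distances inside the function, and all threshold values in the toy example are placeholder assumptions.
\begin{verbatim}
import numpy as np

def graph_fast_nms(scores, theta, r, dist, tau_theta, tau_r, tau_d, tau_o2m):
    """Sort-free Fast NMS with geometric prior (a sketch of Algorithm 1).
    scores : (K,) O2M confidences s^g;  theta, r : (K,) anchor parameters;
    dist   : (K, K) pairwise lane distances (smaller = closer), assumed precomputed."""
    K = len(scores)
    idx = np.arange(K)
    # confidence-prior adjacency A^C: only a higher-confidence prediction may suppress
    a_c = (scores[:, None] > scores[None, :]) | (
        (scores[:, None] == scores[None, :]) & (idx[:, None] > idx[None, :]))
    # geometric-prior adjacency A^G: only sufficiently close anchors interact
    a_g = (np.abs(theta[:, None] - theta[None, :]) < tau_theta) & (
        np.abs(r[:, None] - r[None, :]) < tau_r)
    adj = a_c & a_g
    inv_dist = np.where(adj, 1.0 / (dist + 1e-9), -np.inf)   # D restricted to A = 1
    keep = inv_dist.max(axis=0) < 1.0 / tau_d                # no close, stronger rival
    return np.where(keep & (scores > tau_o2m))[0]

# toy example: anchors 0 and 1 nearly coincide, anchor 2 is far away
scores = np.array([0.9, 0.85, 0.6])
theta = np.array([0.10, 0.11, 1.20]); r = np.array([50.0, 52.0, 200.0])
dist = np.array([[0.0, 0.05, 5.0], [0.05, 0.0, 5.0], [5.0, 5.0, 0.0]])
print(graph_fast_nms(scores, theta, r, dist, 0.1, 10.0, 0.5, 0.5))   # -> [0 2]
\end{verbatim}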
The new algorithm has a distinctly different format from its predecessor \cite{yolact}. We introduce a geometric-prior adjacency matrix $\boldsymbol{A}^G$, so that only predictions from sufficiently proximate anchors can suppress one another. It is straightforward to show that, when all elements of $\boldsymbol{A}^{G}$ are set to $1$ (\textit{i.e.}, disregarding geometric priors), Algorithm \ref{Graph Fast NMS} is equivalent to Fast NMS. Building upon this sort-free Fast NMS with geometric prior, we design the structure of the one-to-one classification head.
The principal limitations of NMS lie in two steps, namely the purely geometric definition of distance (\textit{i.e.}, Eq. (\ref{al_1-3})) and the predetermined threshold employed to eliminate redundant predictions (\textit{i.e.}, Eq. (\ref{al_1-4})). For instance, in scenarios involving double lines, despite the minimal geometric distance between the two lane instances, their semantic divergence is remarkably pronounced. Consequently, we replace the aforementioned two steps with trainable neural networks, allowing them to alleviate the limitations of Fast NMS in a data-driven fashion. The neural network blocks that replace Eq. (\ref{al_1-3}) are Eqs. (\ref{edge_layer_1})-(\ref{edge_layer_3}) in the main text.
In Eq. (\ref{edge_layer_3}), the inverse distance $\boldsymbol{D}_{ij}^{edge}\in\mathbb{R}^{d_n}$ transcends its scalar form, encapsulating the semantic distance between predictions. We use element-wise max pooling over this tensor as the replacement of the max operation applied to scalars, as delineated in Eq. (\ref{maxpooling}) in the main text. Furthermore, the predetermined $\left( \tau ^d \right) ^{-1}$ is no longer utilized as the threshold of the distance; instead, we define a neural network as an implicit decision plane to formulate the final score $\tilde{s}_{i}^{g}$, as defined in Eq. (\ref{node_layer}), serving as the replacement of Eq. (\ref{al_1-4}).

The score $\tilde{s}_{i}^{g}$ output by the neural network transitions from a binary score to a continuous soft score ranging from 0 to 1. We introduce a new threshold $\tau_{o2o}$ within the updated criterion of Eq. (\ref{al_1-5}):
\begin{align}
\varOmega_{nms-free}^{pos}=\left\{ j\,|\,s_{j}^{g}>\tau_{o2m}\,\,and\,\,\tilde{s}_{j}^{g}>\tau_{o2o}\right\}.
\end{align}
This criterion is also referred to as the \textit{dual confidence selection} in the main text.
\label{NMS_appendix}
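A rough PyTorch-style sketch of this data-driven replacement is given below; the layer widths, the specific combination of edge features, and the toy adjacency are simplifications and assumptions rather than the exact architecture of Eqs. (\ref{edge_layer_1})-(\ref{node_layer}), and the dual confidence thresholds in the example are the CULane values $\tau_{o2m}=0.48$ and $\tau_{o2o}=0.46$.
\begin{verbatim}
import torch
import torch.nn as nn

class O2OHeadSketch(nn.Module):
    """Simplified GNN-style O2O classification subhead (not the released architecture)."""
    def __init__(self, roi_dim=64, edge_dim=16):
        super().__init__()
        self.f_in = nn.Linear(roi_dim, edge_dim)
        self.f_out = nn.Linear(roi_dim, edge_dim)
        self.edge_mlp = nn.Sequential(nn.Linear(edge_dim, edge_dim), nn.ReLU())
        self.node_mlp = nn.Sequential(nn.Linear(edge_dim, edge_dim), nn.ReLU(),
                                      nn.Linear(edge_dim, 1))

    def forward(self, roi_feat, adj):
        # edge features: a learned "semantic distance" between every ordered pair
        e = self.edge_mlp(self.f_in(roi_feat)[None, :, :] - self.f_out(roi_feat)[:, None, :])
        # element-wise max pooling over incoming edges allowed by the adjacency matrix
        e = e.masked_fill(~adj[:, :, None], float('-inf'))
        node = torch.nan_to_num(e.max(dim=0).values, neginf=0.0)  # columns with no edge
        return torch.sigmoid(self.node_mlp(node)).squeeze(-1)     # soft score in (0, 1)

K = 20
roi_feat = torch.randn(K, 64)             # RoI features of K proposals
adj = torch.rand(K, K) > 0.7              # placeholder for A = A^C (elementwise) A^G
s_o2m = torch.rand(K)                     # O2M confidences s^g
s_o2o = O2OHeadSketch()(roi_feat, adj)
keep = (s_o2m > 0.48) & (s_o2o > 0.46)    # dual confidence selection
print(keep.nonzero(as_tuple=True)[0])
\end{verbatim}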
\begin{table*}[htbp]
\centering
\caption{Information and hyperparameters for the five datasets. For the CULane dataset, $*$ denotes the actual number of training samples used to train our model. Labels for some validation/test sets are missing; therefore, different splits (test or validation set) are selected for different datasets.}
\begin{adjustbox}{width=\linewidth}
\begin{tabular}{l|l|ccccc}
\toprule
@ -991,9 +946,9 @@ This criteria is also referred to as the \textit{dual confidence selection} in t
\multirow{4}*{Evaluation Hyperparameter}
& $H^{l}\times W^{l}$ &$4\times10$&$4\times10$&$4\times10$&$4\times10$&$6\times13$\\
& $K$ &20&20&20&12&50\\
& $d_n$ &5&8&10&5&5\\
& $\tau_{o2m}$ &0.48&0.40&0.40&0.40&0.45\\
& $\tau_{o2o}$ &0.46&0.46&0.46&0.46&0.44\\
\bottomrule
\end{tabular}
\end{adjustbox}
@ -1009,27 +964,24 @@ This criteria is also referred to as the \textit{dual confidence selection} in t
\label{glaneiou}
\end{figure}
\section{Details of Intersection Over Union between Lane Instances}
To ensure the IoU between lane instances aligns with the conventions of general object detection methods \cite{iouloss}\cite{giouloss}, we have redefined the IoU of lane pairs. As depicted in Fig. \ref{glaneiou}, the newly defined IoU for lane pairs, which we refer to as GLaneIoU, is elaborated as follows:
\begin{align}
\Delta x_{i,p}^{d}&=x_{i+1,p}^{d}-x_{i-1,p}^{d},\,\, \Delta y_{i,p}^{d}=y_{i+1,p}^{d}-y_{i-1,p}^{d}, \\
w_{i,p}&=\frac{\sqrt{\left( \Delta x_{i,p}^{d} \right) ^2+\left( \Delta y_{i,p}^{d} \right) ^2}}{\Delta y_{i,p}^{d}}w^b,\\
b_{i,p}^{l}&=x_{i,p}^{d}-w_{i,p},\,\, b_{i,p}^{r}=x_{i,p}^{d}+w_{i,p},
\end{align}
where $w^{b}$ is the base semi-width parameter and $w_{i,p}$ is the actual semi-width of the $p$-th lane instance. The sets $\left\{ b_{i,p}^{l} \right\} _{i=1}^{N}$ and $\left\{ b_{i,p}^{r} \right\} _{i=1}^{N}$ denote the left and right boundaries of the $p$-th lane instance. Subsequently, we define the intersection and union between lane instances:
\begin{align}
d_{i,pq}^{\mathcal{O}}&=\max \left( \min \left( b_{i,p}^{r}, b_{i,q}^{r} \right) -\max \left( b_{i,p}^{l}, b_{i,q}^{l} \right) , 0 \right),\\
d_{i,pq}^{\xi}&=\max \left( \max \left( b_{i,p}^{l}, b_{i,q}^{l} \right) -\min \left( b_{i,p}^{r}, b_{i,q}^{r} \right) , 0 \right),\\
d_{i,pq}^{\mathcal{U}}&=\max \left( b_{i,p}^{r}, b_{i,q}^{r} \right) -\min \left( b_{i,p}^{l}, b_{i,q}^{l} \right).
\end{align}
The quantities $\left\{d_{i,pq}^{\mathcal{O}}\right\}_{i=1}^{N}$, $\left\{d_{i,pq}^{\xi}\right\}_{i=1}^{N}$ and $\left\{d_{i,pq}^{\mathcal{U}}\right\}_{i=1}^{N}$ denote the overlap distance, gap distance, and union distance, respectively. These definitions closely resemble, but slightly differ from, those in \cite{clrnet} and \cite{adnet}, with modifications to ensure non-negative values. This formulation aims to maintain consistency with the IoU definitions used for bounding boxes. Thus, the overall GLaneIoU between the $p$-th and $q$-th lane instances is expressed as:
\begin{align}
GIoU_{lane}\left( p,q \right)=\frac{\sum\nolimits_{i=j}^k{d_{i,pq}^{\mathcal{O}}}}{\sum\nolimits_{i=j}^k{d_{i,pq}^{\mathcal{U}}}}-g\frac{\sum\nolimits_{i=j}^k{d_{i,pq}^{\xi}}}{\sum\nolimits_{i=j}^k{d_{i,pq}^{\mathcal{U}}}},
\end{align}
where $j$ and $k$ are the indices of the start point and the end point, respectively. It is evident that when $g=0$, $GIoU_{lane}$ corresponds to the IoU for bounding boxes, with a value range of $\left[0, 1 \right]$; when $g=1$, it corresponds to the GIoU \cite{giouloss} for bounding boxes, with a value range of $\left(-1, 1 \right]$.
% In general, when $g>0$, the value range of $GIoU_{lane}$ is $\left(-g, 1 \right]$. We set $g=0$ for cost function and IoU matrix in SimOTA, while $g=1$ for the loss function.
\label{giou_appendix}
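A compact NumPy sketch of GLaneIoU is given below, assuming both lanes are sampled at the same $N$ rows; the centered finite differences are a surrogate for $x_{i+1,p}^{d}-x_{i-1,p}^{d}$, and the base semi-width value is an arbitrary assumption.
\begin{verbatim}
import numpy as np

def glane_iou(x_p, x_q, y, w_base=2.5, g=0.0):
    """GLaneIoU between two lanes sampled at the same rows y (a sketch of the definition)."""
    def bounds(x):
        dx, dy = np.gradient(x), np.gradient(y)        # surrogate for centered differences
        w = np.sqrt(dx ** 2 + dy ** 2) / np.abs(dy) * w_base  # slope-adjusted semi-width
        return x - w, x + w                            # left / right boundaries
    lp, rp = bounds(np.asarray(x_p, float))
    lq, rq = bounds(np.asarray(x_q, float))
    overlap = np.clip(np.minimum(rp, rq) - np.maximum(lp, lq), 0.0, None)  # d^O
    gap     = np.clip(np.maximum(lp, lq) - np.minimum(rp, rq), 0.0, None)  # d^xi
    union   = np.maximum(rp, rq) - np.minimum(lp, lq)                      # d^U
    return overlap.sum() / union.sum() - g * gap.sum() / union.sum()

y = np.linspace(0.0, 100.0, 20)
print(glane_iou(0.5 * y + 10.0, 0.5 * y + 12.0, y, g=0.0))  # two nearly parallel lanes
print(glane_iou(0.5 * y + 10.0, 0.5 * y + 40.0, y, g=1.0))  # disjoint lanes: negative value
\end{verbatim}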
@ -1041,28 +993,25 @@ where j and k are the indices of the start point and the end point, respectively
\caption{Label assignment and loss function for the triplet head.}
\label{head_assign}
\end{figure}
Details about the cost function and label assignments for the triplet head are furnished here. A dual label assignment strategy \cite{date} is employed for the triplet head, as illustrated in Fig. \ref{head_assign}. Specifically, we implement one-to-many label assignment for both the O2M classification subhead and the O2M regression subhead. This part closely aligns with previous work \cite{clrernet}. To endow our model with the NMS-free paradigm, we additionally incorporate the O2O classification subhead and apply a one-to-one label assignment to it.
The cost metrics for both the one-to-one and one-to-many label assignments are articulated as follows:
\begin{align}
\mathcal{C} _{p,q}^{o2o}=\tilde{s}_{p}^{g}\times \left( GIoU_{lane}\left( p,q \right) \right) ^{\beta} \label{o2o_cost},\\
\mathcal{C} _{p,q}^{o2m}=s_{p}^{g}\times \left( GIoU_{lane}\left( p,q \right) \right) ^{\beta}, \label{o2m_cost}
\end{align}
where $\mathcal{C} _{p,q}^{o2o}$ and $\mathcal{C} _{p,q}^{o2m}$ denote the cost metrics between the $p$-th prediction and the $q$-th ground truth, and $g$ in $GIoU_{lane}$ is set to $0$ to ensure it remains non-negative. These metrics imply that both the confidence score and the geometric distance contribute to the cost.
Suppose that there exist $K$ predictions and $G$ ground truths. Let $\pi$ denote a one-to-one label assignment strategy, where $\pi(q)$ indicates that the $\pi(q)$-th prediction is assigned to the $q$-th ground truth. Additionally, $\mathscr{S}_{K, G}$ denotes the set of all possible one-to-one assignment strategies for $K$ predictions and $G$ ground truths. It is straightforward to show that the total number of one-to-one assignment strategies $\left| \mathscr{S} _{K,G} \right|$ is $\frac{K!}{\left( K-G \right)!}$. The final optimal assignment $\hat{\pi}$ is determined as follows:
\begin{align}
\hat{\pi}=\underset{\pi \in \mathscr{S}_{K,G}}{arg\max}\sum_{q=1}^G{\mathcal{C} _{\pi \left( q \right) ,q}^{o2o}}.
\end{align}
This assignment problem can be solved by the Hungarian algorithm \cite{detr}. Finally, $G$ predictions are assigned as positive samples and $K-G$ predictions are assigned as negative samples.
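A small SciPy-based sketch of this one-to-one assignment is shown below; the confidences and GLaneIoU values are random placeholders, and \texttt{linear\_sum\_assignment} with \texttt{maximize=True} plays the role of the Hungarian algorithm.
\begin{verbatim}
import numpy as np
from scipy.optimize import linear_sum_assignment

K, G, beta = 20, 3, 6                      # predictions, ground truths, cost exponent
rng = np.random.default_rng(0)
s_tilde = rng.random(K)                    # O2O confidences of the K predictions
giou = rng.random((K, G))                  # placeholder GLaneIoU (g = 0) values
cost = s_tilde[:, None] * giou ** beta     # C^{o2o}_{p,q}

# Hungarian algorithm: choose the assignment maximizing the summed cost metric
pred_idx, gt_idx = linear_sum_assignment(cost, maximize=True)
labels = np.full(K, -1)                    # -1 marks negative samples
labels[pred_idx] = gt_idx                  # each ground truth gets exactly one positive
print(labels)
\end{verbatim}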
In the one-to-many label assignment, we simply use SimOTA \cite{yolox}, which aligns with previous works \cite{clrernet}. Omitting the detailed process of SimOTA, we only introduce its inputs, namely the cost matrix $\boldsymbol{M}^C\in \mathbb{R}^{G\times K}$ and the IoU matrix $\boldsymbol{M}^{IoU}\in \mathbb{R}^{G\times K}$. The elements of the two matrices are defined as $M^C_{qp}=\mathcal{C} _{p,q}^{o2m}$ and $M^{IoU}_{qp}= GIoU_{lane}\left( p,q \right)$ (with $g=0$), respectively. The number of assigned predictions for each ground truth is variable but does not exceed an upper bound $k_{dynamic}$, which is set to $4$ in our experiments. Finally, there are $K_{pos}$ positive samples and $K-K_{pos}$ negative samples, where $K_{pos}$ ranges from $0$ to $Gk_{dynamic}$. A simplified sketch of this assignment is provided after this section's label.

Given the ground truth label generated by the label assignment strategy for each prediction, we can construct the loss functions for the training phase. As illustrated in Fig. \ref{head_assign}, $\mathcal{L}_{cls}^{o2o}$ and $\mathcal{L}_{rank}$ are used for the O2O classification subhead, $\mathcal{L}_{cls}^{o2m}$ is used for the O2M classification subhead, whereas $\mathcal{L}_{GIOU}$ (with $g=1$), $\mathcal{L}_{end}$ and $\mathcal{L}_{aux}$ are used for the O2M regression subhead.
\label{assign_appendix}
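The following NumPy sketch illustrates a simplified dynamic-$k$ assignment in the spirit of SimOTA; it reproduces only the top-$k$ selection driven by the cost and IoU matrices and a simple conflict-resolution rule, omitting details of the full algorithm, so it should be read as an assumption-laden approximation rather than our exact implementation.
\begin{verbatim}
import numpy as np

def simota_like_assign(cost, iou, k_dynamic=4):
    """Simplified one-to-many assignment (SimOTA-flavoured, not the full algorithm).
    cost, iou : (G, K) matrices M^C and M^IoU; returns a GT index per prediction or -1."""
    G, K = cost.shape
    labels = np.full(K, -1)
    best = np.full(K, -np.inf)
    for q in range(G):
        # dynamic k: at most k_dynamic candidates, guided by the available IoU support
        k_q = int(min(k_dynamic, max(1, round(iou[q].sum()))))
        for p in np.argsort(-cost[q])[:k_q]:        # largest cost metric = best matches
            if cost[q, p] > best[p]:                # resolve conflicts by the larger cost
                labels[p], best[p] = q, cost[q, p]
    return labels

rng = np.random.default_rng(0)
iou = rng.random((2, 10)) * 0.8                     # 2 ground truths, 10 predictions
cost = rng.random(10)[None, :] * iou ** 6           # s^g_p * GLaneIoU(p, q)^beta, toy values
print(simota_like_assign(cost, iou))
\end{verbatim}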
\begin{figure*}[t]
@ -1197,7 +1146,7 @@ Given the ground truth label generated by the label assignment strategy for each
\end{subfigure}
\vspace{0.5em}
\caption{Visualization of detection outcomes in sparse scenarios across four datasets.}
\label{vis_sparse}
\end{figure*}
@ -1298,15 +1247,15 @@ Given the ground truth label generated by the label assignment strategy for each
\end{subfigure}
\vspace{0.5em}
\caption{Visualization of the detection outcomes in sparse and dense scenarios on the CurveLanes dataset.}
\label{vis_dense}
\end{figure*}
\section{Supplementary Implementation Details and Visualization Results}
Some important implementation details for each dataset are shown in Table \ref{dataset_info}, including the dataset information used in our experiments and visualizations, the data processing parameters, and the hyperparameters of Polar R-CNN.
Fig. \ref{vis_sparse} illustrates the visualization outcomes in sparse scenarios spanning four datasets. The top row depicts the ground truth, the middle row shows the proposed lane anchors, and the bottom row exhibits the predictions generated by Polar R-CNN with the NMS-free paradigm. In the top and bottom rows, different colors are used to distinguish lane instances and do not correspond across images. From the middle row, we can see that the LPH of Polar R-CNN effectively proposes anchors clustered around the ground truth, providing a robust prior for the GPH to produce the final lane predictions. Moreover, the number of anchors is significantly reduced compared to previous works, which in theory makes our method faster than other anchor-based methods.
Fig. \ref{vis_dense} shows the visualization outcomes in dense scenarios. The first column displays the ground truth, while the second and third columns show the detection results of the NMS paradigm with a large threshold (\textit{i.e.}, the default NMS@50 with 50 pixels) and a small threshold (\textit{i.e.}, the optimal NMS@15 with 15 pixels), respectively. The final column shows the detection results of the NMS-free paradigm. We observe that NMS@50 mistakenly removes some predictions, leading to false negatives, while NMS@15 fails to eliminate some redundant predictions, leading to false positives. This underscores the difficult trade-off between large and small NMS thresholds. The visualization clearly demonstrates that geometric distance becomes less effective in dense scenarios. Only the proposed O2O classification subhead, driven by data, can address this issue by capturing semantic distance beyond geometric distance. As shown in the last column of Fig. \ref{vis_dense}, the O2O classification subhead successfully eliminates redundant predictions while preserving dense predictions, despite their minimal geometric distances.
\label{vis_appendix}
\end{document}


@ -40,8 +40,8 @@
plt.plot(x_6x13, y_6x13, 'p-', color='orange', alpha=alpha, markersize=12, linewidth=4, label="6*13")
plt.grid(True, linestyle='-', alpha=0.5)
plt.xlabel("Anchor Proposal Number", fontsize=30)  # x-axis label
plt.ylabel("F1@50 (%)", fontsize=30)  # y-axis label
plt.legend(loc="lower right", title="Polarmap Size", title_fontsize=mpl.rcParams['legend.fontsize'])  # legend title
plt.savefig('anchor_num_testing.png', dpi=300)
plt.show()
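Since all three plotting scripts repeat the same per-call fontsize change, an alternative (only a suggested refactoring, not part of this commit) would be to set the sizes once through Matplotlib's rcParams near the top of each script:

import matplotlib as mpl

# applies to every subsequent xlabel/ylabel call in the script
mpl.rcParams.update({
    "axes.labelsize": 30,    # x- and y-axis label size
    "legend.fontsize": 20,   # illustrative value; also read above via mpl.rcParams['legend.fontsize']
})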


@ -43,8 +43,8 @@ y_6x13 = [90.845, 89.979, 89.492, 89.237, 89.101, 89.055, 89.018, 88.999, 88.996
plt.plot(x_6x13, y_6x13, 'p-', color='orange', alpha=alpha, markersize=12, linewidth=4, label="6*13")
plt.grid(True, linestyle='-', alpha=0.5)
plt.xlabel("Anchor Proposal Number", fontsize=30)  # x-axis label
plt.ylabel("Precision@50 (%)", fontsize=30)  # y-axis label
plt.legend(loc="lower right", title="Polarmap Size", title_fontsize=mpl.rcParams['legend.fontsize'])  # legend title
plt.savefig('anchor_num_testing_p.png', dpi=300)
plt.show()


@ -39,8 +39,8 @@ y_6x13 = [69.227, 72.876, 73.825, 74.077, 74.174, 74.210, 74.232, 74.249, 74.260
plt.plot(x_6x13, y_6x13, 'p-', color='orange', alpha=alpha, markersize=12, linewidth=4, label="6*13")
plt.grid(True, linestyle='-', alpha=0.5)
plt.xlabel("Anchor Proposal Number", fontsize=30)  # x-axis label
plt.ylabel("Recall@50 (%)", fontsize=30)  # y-axis label
plt.legend(loc="lower right", title="Polarmap Size", title_fontsize=mpl.rcParams['legend.fontsize'])  # legend title
plt.savefig('anchor_num_testing_r.png', dpi=300)
plt.show()


@ -18,7 +18,7 @@
data = {
'ADNet (2023)': {'x': [64, 64], 'y': [77.56, 78.94], 'sizes': [80*2.5, 180*2.5], 'color': 'green', 'marker': 'v'},
'SRLane (2024)': {'x': [40], 'y': [79.73], 'sizes': [180*2.5], 'color': 'red', 'marker': '*'},
'Sparse Laneformer (2024)': {'x': [20, 20, 20], 'y': [76.55, 77.77, 77.83], 'sizes': [40*2.5, 80*2.5, 180*2.5], 'color': 'purple', 'marker': '^'},
'Polar R-CNN (Ours)': {'x': [20, 20, 20, 20], 'y': [80.81, 80.92, 81.34, 81.49], 'sizes': [20*2.5, 40*2.5, 80*2.5, 180*2.5], 'color': 'blue', 'marker': 'o'},
}
# define a uniform marker size
@ -39,8 +39,8 @@ for label, props in data.items():
# set the title and axis labels
plt.grid(True, linestyle='-', alpha=0.5)
plt.xlabel('Anchor Proposal Number', fontsize=16)
plt.ylabel('F1@50 (%)', fontsize=16)
# add the legend and adjust the marker size of its entries
legend = plt.legend(loc="best")
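The comment above only creates the legend; one possible way to also normalise the legend marker sizes it refers to, in this script and the latency script below (a sketch, assuming the entries were drawn with plt.scatter so the handles are PathCollection objects), is:

# legend_handles exists from Matplotlib 3.7 on; older releases use legendHandles
handles = getattr(legend, "legend_handles", getattr(legend, "legendHandles", []))
for handle in handles:
    handle.set_sizes([150])  # one fixed, readable size for every legend entry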

(Binary figure files under thesis_figure/ updated; new file thesis_figure/elu_proof.png added.)

@ -19,8 +19,8 @@
data = {
'ADNet (2023)': {'x': [8.4, 10.67], 'y': [77.56, 78.94], 'color': 'green', 'marker': 'v'},
'SRLane (2024)': {'x': [3.12], 'y': [79.73], 'color': 'red', 'marker': '*'},
'UFLDv2 (2022)': {'x': [2.7, 4.6], 'y': [75, 76], 'color': 'purple', 'marker': '^'},
'Polar R-CNN-NMS (ours)': {'x': [3.71, 4.97, 5.47, 6.14], 'y': [80.81, 80.92, 81.49, 81.34], 'color': 'blue', 'marker': 'o'},
'Polar R-CNN (ours)': {'x': [4.77, 6.10, 6.54, 7.13], 'y': [80.81, 80.92, 81.49, 81.34], 'color': 'cyan', 'marker': 'o'},
}
@ -43,8 +43,8 @@ for label, props in data.items():
# set the title and axis labels
plt.grid(True, linestyle='-', alpha=0.5)
plt.xlabel('Latency (ms) on NVIDIA A100', fontsize=16)
plt.ylabel('F1@50 (%)', fontsize=16)
# add the legend and adjust the marker size of its entries
legend = plt.legend(loc="upper right")

(New binary file added: thesis_figure/sunkai.jpg.)