This commit is contained in:
王老板 2024-11-01 22:43:46 +08:00
parent 527fa91af0
commit 24982236e5
4 changed files with 72 additions and 59 deletions

main.tex

@ -134,7 +134,7 @@ To address the above two issues, we propose Polar R-CNN, a novel anchor-based me
\begin{figure*}[ht]
\centering
\includegraphics[width=0.99\linewidth]{thesis_figure/ovarall_architecture.png}
\caption{An illustration of the Polar R-CNN architecture. It has a similar pipeline with the Faster R-CNN for the task of object detection, and consists of a backbone, a \textit{Feature Pyramid Network} with three levels of feature maps, respectively denote by $P_1, P_2, P_3$, followed by a \textit{Local Polar Module}, and a \textit{Global Polar Module} for lane detection. Based on the designed lane representation and lane anchor representation in polar coordinate system, the local polar module can propose sparse line anchors and the global polar module can produce the final accurate lane predictions. The global polar module includes a triplet head, which comprises the \textit{one-to-one} (O2O) classification subhead, the \textit{one-to-many} (O2M) classification subhead, and the \textit{one-to-many} (O2M) regression subhead.}
\caption{An illustration of the Polar R-CNN architecture. It has a pipeline similar to that of Faster R-CNN for the task of object detection, and consists of a backbone, a \textit{Feature Pyramid Network} with three levels of feature maps, respectively denoted by $\boldsymbol{P}_1$, $\boldsymbol{P}_2$ and $\boldsymbol{P}_3$, followed by a \textit{Local Polar Module} and a \textit{Global Polar Module} for lane detection. Based on the designed lane representation and lane anchor representation in the polar coordinate system, the local polar module proposes sparse line anchors and the global polar module produces the final accurate lane predictions. The global polar module includes a triplet head, which comprises the \textit{one-to-one} (O2O) classification subhead, the \textit{one-to-many} (O2M) classification subhead, and the \textit{one-to-many} (O2M) regression subhead.}
\label{overall_architecture}
\end{figure*}
\section{Related Works}
@ -168,7 +168,7 @@ In this work, we aim to address the above two issues in the framework of anchor-
\includegraphics[width=\imgwidth]{thesis_figure/coord/polar.png}
\caption{}
\end{subfigure}
\caption{Different descriptions for anchor parameters: (a) Ray: defined by its start point (\textit{e.g.} the green point $\left( x_{1}^{s},y_{1}^{s} \right)$ or the yellow point $\left( x_{2}^{s},y_{2}^{s} \right) $) and direction $\theta^{s}$. (b) Polar: defined by its radius $r$ and angle $\theta$.} %rectangular coordinates
\caption{Different descriptions for anchor parameters: (a) Ray: defined by its start point (\textit{e.g.}, the green point $\left( x_{1}^{s},y_{1}^{s} \right)$ or the yellow point $\left( x_{2}^{s},y_{2}^{s} \right) $) and direction $\theta^{s}$. (b) Polar: defined by its radius $r$ and angle $\theta$.} %rectangular coordinates
\label{coord}
\end{figure}
%
@ -194,11 +194,11 @@ However, the representation of lane anchors as rays presents certain limitations
\par
To better leverage the local inductive bias properties of CNNs, we define two types of polar coordinate systems: the local and the global coordinate system. The local polar coordinate system is used to generate lane anchors, while the global coordinate system expresses these anchors in a unified form over the entire image and regresses them to the ground truth lane instances. Given the distinct roles of the local and global systems, we adopt a two-stage framework for our Polar R-CNN, similar to Faster R-CNN \cite{fasterrcnn}.
\par
The local polar system is designed to predict lane anchors adaptable to both sparse and dense scenarios. In this system, there are many poles with each as the lattice point of the feature map, referred to as local poles. As illustrated on the left side of Fig. \ref{lpmlabel}, there are two types of local poles: positive and negative. Positive local poles (\textit{e.g.}, the blue points) have a radius $r_{i}^{l}$ below a threshold $\lambda^l$, otherwise, they are classified as negative local poles (\textit{e.g.}, the red points). Each local pole is responsible for predicting a single lane anchor. While a lane ground truth may generate multiple lane anchors, as shown in Fig. \ref{lpmlabel}, there are three positive poles around the lane instance (green lane), which are expected to generate three lane anchors.
The local polar system is designed to predict lane anchors adaptable to both sparse and dense scenarios. In this system, there are many poles, each located at a lattice point of the feature map and referred to as a local pole. As illustrated on the left side of Fig. \ref{lpmlabel}, there are two types of local poles: positive and negative. Positive local poles (\textit{i.e.}, the blue points) have a radius $r_{i}^{l}$ below a threshold $\lambda^l$; otherwise, they are classified as negative local poles (\textit{i.e.}, the red points). Each local pole is responsible for predicting a single lane anchor. Note that a single ground truth lane may generate multiple lane anchors; as shown in Fig. \ref{lpmlabel}, there are three positive poles around the lane instance (the green lane), which are expected to generate three lane anchors.
%This one-to-many approach is essential for ensuring comprehensive anchor proposals, especially since some local features around certain poles may be lost due to damage or occlusion of the lane curve.
\par
In the local polar coordinate system, the parameters of each lane anchor are determined based on the location of its corresponding local pole. However, in practical terms, once a lane anchor is generated, its definitive position becomes immutable and independent of its original local pole. To simplify the representation of lane anchors in the second stage of Polar-RCNN, a global polar system has been designed, featuring a singular and unified pole that serves as a reference point for the entire image. The location of this global pole is manually set, and in this case, it is positioned near the static \textit{vanishing point} observed across the entire lane image dataset\cite{Vpoint}. This approach ensures a consistent and unified polar coordinate for expressing lane anchors within the global context of the image, facilitating accurate regression to the ground truth lane instances.
In the local polar coordinate system, the parameters of each lane anchor are determined based on the location of its corresponding local pole. However, once a lane anchor is generated, its position becomes fixed and independent of its original local pole. To simplify the representation of lane anchors in the second stage of Polar R-CNN, a global polar system is designed, featuring a single, unified pole that serves as a reference point for the entire image. The location of this global pole is manually set; in our case, it is positioned near the static \textit{vanishing point} observed across the entire lane image dataset \cite{vanishing}. This approach ensures a consistent and unified polar coordinate for expressing lane anchors within the global context of the image, facilitating accurate regression to the ground truth lane instances.
\begin{figure}[t]
\centering
@ -213,20 +213,20 @@ As shown in Fig. \ref{overall_architecture}, three levels of feature maps, denot
\end{equation}
The downsampled feature map $\boldsymbol{F}_d$ is then fed into two branches: a \textit{regression} branch $\phi _{reg}^{l}\left(\cdot \right)$ and a \textit{classification} branch $\phi _{cls}^{l}\left(\cdot \right)$, \textit{i.e.},
\begin{align}
\boldsymbol{F}_{reg}\gets \phi _{reg}^{l}\left( \boldsymbol{F}_d \right)\ &\text{and}\ \boldsymbol{F}_{reg\,\,}\in \mathbb{R} ^{2\times H^{l}\times W^{l}},\\
\boldsymbol{F}_{cls}\gets \phi _{cls}^{l}\left( \boldsymbol{F}_d \right)\ &\text{and}\ \boldsymbol{F}_{cls}\in \mathbb{R} ^{H^{l}\times W^{l}}. \label{lpm equ}
\boldsymbol{F}_{reg}&\gets \phi _{reg}^{l}\left( \boldsymbol{F}_d \right),\,\,\boldsymbol{F}_{reg\,\,}\in \mathbb{R} ^{2\times H^{l}\times W^{l}},\\
\boldsymbol{F}_{cls}&\gets \phi _{cls}^{l}\left( \boldsymbol{F}_d \right),\,\,\boldsymbol{F}_{cls}\in \mathbb{R} ^{H^{l}\times W^{l}}. \label{lpm equ}
\end{align}
The regression branch consists of a single $1\times1$ convolutional layer, with the goal of generating lane anchors by outputting their angles $\theta_j$ and radii $r^{l}_{j}$, \textit{i.e.}, $\boldsymbol{F}_{reg\,\,} \equiv \left\{\theta_{j}, r^{l}_{j}\right\}_{j=1}^{H^{l}\times W^{l}}$, in the local polar coordinate system defined previously. Similarly, the classification branch $\phi _{cls}^{l}\left(\cdot \right)$ consists of only two $1\times1$ convolutional layers for simplicity. This branch predicts the confidence heat map $\boldsymbol{F}_{cls\,\,}\equiv \left\{ s_j^l \right\} _{j=1}^{H^l\times W^l}$ for local poles, each associated with a feature point. By discarding local poles with lower confidence, the module increases the likelihood of selecting potential positive foreground lane anchors while effectively removing background lane anchors.
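For concreteness, a minimal PyTorch-style sketch of the two LPM branches is given below. It is an illustration only: the module name, channel width, and the ReLU between the two classification convolutions are our own assumptions rather than the exact released implementation.
\begin{verbatim}
import torch
import torch.nn as nn

class LocalPolarModule(nn.Module):
    """Sketch of the LPM branches: 1x1-conv regression and classification heads."""
    def __init__(self, in_channels: int = 64):
        super().__init__()
        # regression branch: a single 1x1 conv -> (theta_j, r_j^l) per lattice point
        self.reg_branch = nn.Conv2d(in_channels, 2, kernel_size=1)
        # classification branch: two 1x1 convs -> local-pole confidence heat map s_j^l
        self.cls_branch = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, 1, kernel_size=1),
        )

    def forward(self, f_d: torch.Tensor):
        # f_d: downsampled feature map of shape (B, C, H_l, W_l)
        f_reg = self.reg_branch(f_d)             # (B, 2, H_l, W_l): angle and radius
        f_cls = self.cls_branch(f_d).squeeze(1)  # (B, H_l, W_l): confidence logits
        return f_reg, f_cls
\end{verbatim}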
\par
\textbf{Loss Function for LPM.} To train the LPM, we define the ground truth labels for each local pole as follows: the ground truth radius $\hat{r}^l_j$ is set to the minimum distance from the local pole to the corresponding lane curve, while the ground truth angle $\hat{\theta}^l_j$ is set to the orientation of the vector extending from the local pole to the nearest point on the curve. Consequently, we have a label set of local poles $\hat{\boldsymbol{F}}_{cls}=\{\hat{s}_j^l\}_{j=1}^{H^l\times W^l}$, where $\hat{s}_j^l=1$ if the $j$-th local pole is positive and $\hat{s}_j^l=0$ if it is negative. Once the regression and classification labels are established, as shown in Fig. \ref{lpmlabel}, the LPM can be trained using the $Smooth_{L1}$ loss $S_{L1}\left(\cdot \right)$ for the regression branch and the \textit{Binary Cross-Entropy} loss $BCE\left( \cdot , \cdot \right)$ for the classification branch. The loss functions for the LPM are given as follows:
\begin{align}
\mathcal{L} ^{l}_{cls}&=BCE\left( \boldsymbol{F}_{cls},\hat{\boldsymbol{F}}_{cls} \right)\\
\mathcal{L} _{reg}^{l}&=\frac{1}{N_{pos}^{l}}\sum_{j\in \left\{ j|\hat{r}_{j}^{l}<\lambda^l \right\}}{\left( S_{L1}\left( \theta _{j}^{l}-\hat{\theta}_{j}^{l} \right) +S_{L1}\left( r_{j}^{l}-\hat{r}_{j}^{l} \right) \right)}
\mathcal{L} ^{l}_{cls}&=BCE\left( \boldsymbol{F}_{cls},\hat{\boldsymbol{F}}_{cls} \right),\\
\mathcal{L} _{reg}^{l}&=\frac{1}{N_{pos}^{l}}\sum_{j\in \left\{ j|\hat{r}_{j}^{l}<\lambda^l \right\}}{\left( S_{L1}\left( \theta _{j}^{l}-\hat{\theta}_{j}^{l} \right) +S_{L1}\left( r_{j}^{l}-\hat{r}_{j}^{l} \right) \right)},
\label{loss_lph}
\end{align}
where $N^{l}_{pos}=\left|\{j|\hat{r}_j^l<\lambda^{l}\}\right|$ is the number of positive local poles in LPM.
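The loss in Eq. (\ref{loss_lph}) can be sketched as follows, assuming the classification branch outputs logits and that the ground-truth radii and angles of all local poles have already been computed; the reduction and the absence of loss weighting are illustrative assumptions.
\begin{verbatim}
import torch
import torch.nn.functional as F

def lpm_loss(theta, r, s_logit, theta_gt, r_gt, lambda_l):
    # theta, r, s_logit:  predictions flattened over the H_l * W_l local poles
    # theta_gt, r_gt:     ground-truth angle / radius for every local pole
    pos = r_gt < lambda_l                           # positive local poles
    loss_cls = F.binary_cross_entropy_with_logits(s_logit, pos.float())
    if pos.any():                                   # smooth-L1 only on positive poles
        loss_reg = (F.smooth_l1_loss(theta[pos], theta_gt[pos], reduction="sum") +
                    F.smooth_l1_loss(r[pos], r_gt[pos], reduction="sum")) / pos.sum()
    else:
        loss_reg = r.sum() * 0.0                    # no positive pole in this image
    return loss_cls, loss_reg
\end{verbatim}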
\par
\textbf{Top-$K$ Anchor Selection.} As discussed above, all $H^{l}\times W^{l}$ anchors, each associated with a local pole in the feature map, are all considered as candidates during the training stage. However, some of these anchors serve as background anchors. We select $K$ anchors with the top-$K$ highest confidence scores as the foreground candidates to feed into the second stage (\textit{i.e.}, global polar module). During training, all anchors are chosen as candidates, where $K=H^{l}\times W^{l}$ assists it assists \textit{Global Polar Module} (the second stage) in learning from a diverse range of features, including various negative background anchor samples. Conversely, during the evaluation stage, some anchors with lower confidence can be excluded such that $K\leq H^{l}\times W^{l}$. This strategy effectively filters out potential negative anchors and reduces the computational complexity of the second stage. By doing so, it maintains the adaptability and flexibility of anchor distribution while decreasing the total number of anchors especially in the sparse scenarios. The following experiments will demonstrate the effectiveness of different top-$K$ anchor selection strategies.
\textbf{Top-$K$ Anchor Selection.} As discussed above, all $H^{l}\times W^{l}$ anchors, each associated with a local pole in the feature map, are considered as candidates during the training stage. However, some of these anchors serve as background anchors. We select the $K$ anchors with the top-$K$ highest confidence scores as the foreground candidates to feed into the second stage (\textit{i.e.}, the global polar module). During training, all anchors are chosen as candidates, \textit{i.e.}, $K=H^{l}\times W^{l}$, which assists the \textit{Global Polar Module} (the second stage) in learning from a diverse range of features, including various negative background anchor samples. Conversely, during the evaluation stage, anchors with lower confidence can be excluded such that $K\leq H^{l}\times W^{l}$. This strategy effectively filters out potential negative anchors and reduces the computational complexity of the second stage. By doing so, it maintains the adaptability and flexibility of the anchor distribution while decreasing the total number of anchors, especially in sparse scenarios. The following experiments will demonstrate the effectiveness of different top-$K$ anchor selection strategies.
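A sketch of this selection step is shown below; the interface is hypothetical, but it captures the behavior described above: all anchors are kept during training, and only the $K$ most confident poles are kept at inference.
\begin{verbatim}
import torch

def select_top_k(anchors, scores, k, training):
    # anchors: (H_l*W_l, 2) polar parameters (theta, r_l); scores: (H_l*W_l,)
    if training or k >= scores.numel():
        return anchors, scores                # K = H_l * W_l during training
    top_scores, idx = scores.topk(k)          # keep the K highest-confidence poles
    return anchors[idx], top_scores
\end{verbatim}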
\begin{figure}[t]
\centering
@ -251,18 +251,18 @@ where $\boldsymbol{c}^{g} \in \mathbb{R}^{2}$ and $\boldsymbol{c}^{l}_{j} \in \m
where the y-coordinates $\boldsymbol{y}_{j}\equiv \{y_{1,j},y_{2,j},\cdots ,y_{N,j}\}$ of the $j$-th lane anchor are uniformly sampled vertically from the image, as previously mentioned. The proof of Eqs. (\ref{l2g})-(\ref{positions}) can be found in Appendix \ref{proof_l2g}. The coordinates of the $j$-th lane anchor are then given by $\boldsymbol{\ell}_j=\{\boldsymbol{x}_{j},\boldsymbol{y}_j\}\equiv \left\{(x_{1,j},y_{1,j}),(x_{2,j},y_{2,j}),\cdots ,(x_{N,j}, y_{N,j})\right\}$.
\par
Given the different level feature maps $\boldsymbol{P}_1, \boldsymbol{P}_2, \boldsymbol{P}_3$ from FPN, we can extract the channel-wise features of each point corresponding to the positions of $\{(x_{1,j},y_{1,j}),(x_{2,j},y_{2,j}),\cdots,(x_{N,j},y_{N,j})\}_{j=1}^{K}$, respectively denoted as $\boldsymbol{F}_{1,j}, \boldsymbol{F}_{2,j}, \boldsymbol{F}_{3,j}\in \mathbb{R} ^{N\times C_f}$. To enhance representation, similar to \cite{srlane}, we employ a weighted sum strategy to combine features from the three levels by:
\begin{equation}
\begin{align}
\boldsymbol{F}^s_j=\sum_{k=1}^3{\frac{e^{\boldsymbol{w}_{k}}}{\sum_{k=1}^3{e^{\boldsymbol{w}_{k}}}}\circ \boldsymbol{F}_{k,j}},
\end{equation}
\end{align}
where $\boldsymbol{w}_{k}\in \mathbb{R}^{N}$ represents the trainable aggregation weights ascribed to the $N$ sampled points, and the symbol ``$\circ$'' represents element-wise multiplication (\textit{i.e.}, Hadamard product). Instead of concatenating the three sampled features into $\boldsymbol{F}^s_j\in \mathbb{R} ^{N\times 3C_f}$ directly, the adaptive summation significantly reduces the feature dimension to $\boldsymbol{F}^s_j\in \mathbb{R} ^{N\times C_f}$, which is one-third of the initial dimension. The weighted sum of the tensors is flattened into a vector $\widehat{\boldsymbol{F}}^s_j\in \mathbb{R} ^{NC_f}$, and then integrated through a linear transformation:
\begin{align}
\boldsymbol{F}_{j}^{roi}\gets \boldsymbol{W}_{pool}\widehat{\boldsymbol{F}}_{j}^{s},\quad j=1,2,\cdots,K\notag.
\boldsymbol{F}_{j}^{roi}\gets \boldsymbol{W}_{pool}\widehat{\boldsymbol{F}}_{j}^{s},\quad j=1,2,\cdots,K.
\end{align}
Here $\boldsymbol{W}_{pool}\in \mathbb{R} ^{d_r\times NC_f}$ is employed to further reduce the dimension of integrated feature $\widehat{\boldsymbol{F}}_{j}^{s}$, thereby yielding the final RoI features $\{\boldsymbol{F}_{i}^{roi}\in \mathbb{R} ^{d_r}\}_{i=1}^K$, which are feed to the following triplet head.
Here, $\boldsymbol{W}_{pool}\in \mathbb{R} ^{d_r\times NC_f}$ is employed to further reduce the dimension of the integrated feature $\widehat{\boldsymbol{F}}_{j}^{s}$, thereby yielding the final RoI features $\{\boldsymbol{F}_{i}^{roi}\in \mathbb{R} ^{d_r}\}_{i=1}^K$, which are fed to the following triplet head.
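A compact sketch of this adaptive aggregation and pooling is given below; the per-level point features are assumed to be pre-sampled (\textit{e.g.}, by bilinear grid sampling), and all names and sizes are illustrative.
\begin{verbatim}
import torch
import torch.nn as nn

class AnchorRoIPooling(nn.Module):
    """Weighted sum over FPN levels followed by a W_pool-style linear projection."""
    def __init__(self, num_points, c_f, d_r):
        super().__init__()
        self.w = nn.Parameter(torch.zeros(3, num_points))   # aggregation weights w_k
        self.pool = nn.Linear(num_points * c_f, d_r)         # plays the role of W_pool

    def forward(self, f_levels):
        # f_levels: (3, K, N, C_f) point features sampled from P_1, P_2, P_3
        alpha = torch.softmax(self.w, dim=0)                  # softmax across the 3 levels
        f_s = (alpha[:, None, :, None] * f_levels).sum(0)     # (K, N, C_f) weighted sum
        return self.pool(f_s.flatten(1))                      # (K, d_r) RoI features
\end{verbatim}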
\par
\textbf{Triplet Head.} The lane detection head is designed to classify and regress the lane anchors generated by the LPM based on the RoI pooling features $\{\boldsymbol{F}_{i}^{roi}\in \mathbb{R} ^{d_r}\}_{i=1}^K$. A traditional lane detection head \cite{laneatt} is usually equipped with a \textit{One-to-Many} (O2M) classification subhead and a \textit{One-to-Many} (O2M) regression subhead. However, the one-to-many mechanism (\textit{i.e.}, \textit{many candidates for one ground truth}) causes redundant predictions for each lane and thus requires an NMS post-processing operator. NMS is non-differentiable and breaks end-to-end training, which leads to manually tuned hyperparameters and suboptimal performance. To eliminate NMS post-processing while achieving end-to-end learning, we introduce a triplet head module for lane detection.
\par
As shown in Fig. \ref{gpm}, the triplet head consists of three components: the O2M classification, the O2M regression, and another \textit{One-to-One} (O2O) classification. The features of each lane anchor $\{\boldsymbol{F}_{j}^{roi}\}$ are feed into three subheads, respectively. To keep both simplicity and efficiency, both the O2M classification and O2M regression subheads apply two \textit{multi-layer perceptions} (MLPs) to $\{\boldsymbol{F}_{j}^{roi}\}$ and then generate the confidence scores $\left\{{s}_j^g\right\}$ by the O2M classification subhead and the x-coordinate offsets $\{\Delta\boldsymbol{x}_j\}$ by the O2M regression subhead for each lane anchor. More details about the O2M classification and O2M regression subheads can be referred to \cite{yolox}. The O2O classification subhead is introduced to generate non-redundant lane candidates within a NMS-free paradigm. However, the direct use of one-to-one strategy (\textit{i.e.}, \textit{assigning one positive anchor for one ground truth lane}) based on the extracted features will damage model's learning\cite{dualassign}\cite{yolov10}. Instead, the proposed O2O classification subhead considers both the \textit{confidence prior} $\left\{{s}_j^g\right\}$ of O2M classification subhead and the \textit{spatial geometric prior} of the polar parameters (\textit{i.e.}, the angle $\theta$ and the radius $r$), and apply these priors to adaptively refine the lane anchor features $\{\boldsymbol{F}_{j}^{roi}\}$, which generates the refined lane anchor features $\{\boldsymbol{D}_{j}^{roi}\}$ and the confidence scores $\left\{\tilde{s}_j^g\right\}$. The structural design draws inspiration from the Fast NMS \cite{yolact}, with further particulars accessible in the Appendix \ref{NMS_appendix}.
As shown in Fig. \ref{gpm}, the triplet head consists of three components: the O2M classification, the O2M regression, and another \textit{One-to-One} (O2O) classification. The features of each lane anchor $\{\boldsymbol{F}_{j}^{roi}\}$ are fed into the three subheads, respectively. To keep both simplicity and efficiency, the O2M classification and O2M regression subheads each apply two \textit{multi-layer perceptrons} (MLPs) to $\{\boldsymbol{F}_{j}^{roi}\}$ and then generate the confidence scores $\left\{{s}_j^g\right\}$ by the O2M classification subhead and the x-coordinate offsets $\{\Delta\boldsymbol{x}_j\}$ by the O2M regression subhead for each lane anchor. More details about the O2M classification and O2M regression subheads can be found in \cite{yolox}. The O2O classification subhead is introduced to generate non-redundant lane candidates within an NMS-free paradigm. However, the direct use of the one-to-one strategy (\textit{i.e.}, \textit{assigning one positive anchor to one ground truth lane}) based on the extracted features will damage the model's learning \cite{dualassign}\cite{yolov10}. Instead, the proposed O2O classification subhead considers both the \textit{confidence prior} $\left\{{s}_j^g\right\}$ of the O2M classification subhead and the \textit{spatial geometric prior} of the polar parameters (\textit{i.e.}, the angle $\theta$ and the radius $r$), and applies these priors to adaptively refine the lane anchor features $\{\boldsymbol{F}_{j}^{roi}\}$, generating the refined lane anchor features $\{\boldsymbol{D}_{j}^{roi}\}$ and the confidence scores $\left\{\tilde{s}_j^g\right\}$. The structural design draws inspiration from Fast NMS \cite{yolact}, with further details provided in Appendix \ref{NMS_appendix}.
\par
More specifically, the O2O classification subhead first calculates the \textit{semantic distance} between the $i$-th anchor with its x-coordinate $\boldsymbol{x}_{i}$ and the $j$-th anchor with its x-coordinate $\boldsymbol{x}_{j}$ as follows:
\begin{align}
@ -277,20 +277,20 @@ where $\boldsymbol{D}_{ij}^{edge}\in \mathbb{R}^{d_n}$ denotes the implicit sema
where $\odot$ is the element-wise multiplication, $\boldsymbol{A}^C\in\mathbb{R}^{K\times K}$ and $\boldsymbol{A}^G\in\mathbb{R}^{K\times K}$ are the confidence-prior adjacency matrix and the geometric-prior adjacency matrix, respectively. The confidence-prior adjacency matrix $\boldsymbol{A}^C=\left(A_{ij}^C\right)_{i,j=1}^K$ is defined as follows:
\begin{align}
A_{ij}^{C}=\begin{cases}
1,\, if \,s_i^g>s_j^g\,\,or\,\,( s_i^g=s_j^g\,\,and\,\,i>j );\\
0,\,\,others.
1,\, \mathrm{if}\,\,s_i^g>s_j^g\,\,or\,\,( s_i^g=s_j^g\,\,and\,\,i>j );\\
0,\,\mathrm{otherwise}.
\end{cases}
\label{confidential matrix1}
\end{align}
Here, $s_i^g$ and $s_j^g$ are the confidence scores predicted by the O2M classification subhead for the $i$-th and the $j$-th lane anchors, respectively. According to Eq. \eqref{confidential matrix1}, the role of $\boldsymbol{A}^C$ is to allow lane anchors with higher confidence scores to suppress those with lower scores. In order to leverage geometric priors based on the representation in polar coordinates (\textit{i.e.}, the global polar radius $r^g$ and angle $\theta$), we further introduce the geometric-prior adjacency matrix $\boldsymbol{A}^G=\left(A_{ij}^G\right)_{i,j=1}^K$, defined by
\begin{align}
A_{ij}^{G}=\begin{cases}
1,\, if \,\left| \theta _i-\theta _j \right|<\tau^{\theta}\,\,and\,\,\left| r_{i}^{g}-r_{j}^{g} \right|<\lambda^g;\\
0,\,\,others,
1,\, \mathrm{if}\,\,\left| \theta _i-\theta _j \right|<\tau^{\theta}\,\,and\,\,\left| r_{i}^{g}-r_{j}^{g} \right|<\lambda^g;\\
0,\,\mathrm{otherwise},
\end{cases}
\label{geometric prior matrix1}
\end{align}
where $\tau^{\theta}$ and $\lambda^g$ are the thresholds to measure the geometric distances. Based on the definition of geometric-prior and confidence-prior adjacency matrices, the overall adjacency matrix $\boldsymbol{A}$ can be seen as a directed graph with each lane anchor as a node and the ROI features $\boldsymbol{F}_i^{roi}$ serving as their input features. Specifically, if an element $A_{ij}$ in $\boldsymbol{A}$ equals to 1, a directed edge exists from the $i$-th anchor and the $j$-th anchor, which implies that the $j$-th anchor may be suppressed by the $i$-th anchor when the confidence score of the $i$-th anchor exceeds that of the $j$-th anchor and their geometric distance is sufficiently small (\textit{i.e.}, less that a predefined threshold).
where $\tau^{\theta}$ and $\lambda^g$ are the thresholds to measure the geometric distances. Based on the definition of the geometric-prior and confidence-prior adjacency matrices, the overall adjacency matrix $\boldsymbol{A}$ can be seen as defining a directed graph, with each lane anchor as a node and the RoI features $\boldsymbol{F}_i^{roi}$ serving as the node input features. Specifically, if an element $A_{ij}$ in $\boldsymbol{A}$ equals 1, a directed edge exists from the $i$-th anchor to the $j$-th anchor, which implies that the $j$-th anchor may be suppressed by the $i$-th anchor when the confidence score of the $i$-th anchor exceeds that of the $j$-th anchor and their geometric distance is sufficiently small (\textit{i.e.}, less than a predefined threshold).
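A small sketch of how the two adjacency matrices in Eqs. (\ref{confidential matrix1}) and (\ref{geometric prior matrix1}) can be built with plain broadcasting is given below; tensor names and threshold values are illustrative assumptions.
\begin{verbatim}
import torch

def build_adjacency(scores, theta, r_g, tau_theta, lambda_g):
    # scores, theta, r_g: (K,) O2M confidences and global polar parameters of the anchors
    k = scores.numel()
    idx = torch.arange(k, device=scores.device)
    # A^C: anchor i may suppress anchor j if its score is higher (ties broken by index)
    a_conf = (scores[:, None] > scores[None, :]) | \
             ((scores[:, None] == scores[None, :]) & (idx[:, None] > idx[None, :]))
    # A^G: suppression is only allowed between geometrically close anchors
    a_geo = ((theta[:, None] - theta[None, :]).abs() < tau_theta) & \
            ((r_g[:, None] - r_g[None, :]).abs() < lambda_g)
    return (a_conf & a_geo).float()      # overall adjacency A (element-wise product)
\end{verbatim}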
\par
Then, by considering the suppressive effect of the lane anchors induced by the overall adjacency matrix $\boldsymbol{A}$, the lane anchor features $\boldsymbol{F}_j^{roi}$ can be further refined from the semantic distance tensor $\mathcal{D}^{edge}=\{\boldsymbol{D}_{ij}^{edge}\}\in\mathbb{R}^{K\times K\times d_n}$ as follows:
\begin{align}
@ -301,11 +301,11 @@ where $j=1,2,\cdots,K$ and $\mathrm{MPool}_{col}(\cdot|\boldsymbol{A}(:,j)=1)$ i
\begin{align}
\tilde{s}_{j}^{g}\gets \mathrm{MLP}_{roi}\left( \boldsymbol{D}_{j}^{roi} \right), j=1,\cdots,K. \label{node_layer}
\end{align}
As stated above, the O2O classification subhead is formed from Eqs. (\ref{edge_layer_1})-(\ref{node_layer}), which can be seen as a directed graph driven by neural networks. The structure in O2O classification subhead is referred to as \textit{graph neural network} (GNN) block.
As stated above, the O2O classification subhead is formed from Eqs. (\ref{edge_layer_1})-(\ref{node_layer}) and can be seen as a directed graph driven by neural networks; this structure is referred to as the \textit{graph neural network} (GNN) block.
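To make the data flow of the GNN block concrete, the following sketch performs the masked column-wise max pooling and the final scoring MLP; the feature width $d_n$, the MLP architecture, and the handling of columns without incoming edges are assumptions made for illustration, not the exact implementation.
\begin{verbatim}
import torch
import torch.nn as nn

def o2o_scores(d_edge, adj, mlp_roi):
    # d_edge: (K, K, d_n) semantic distance tensor; adj: (K, K) adjacency matrix A
    mask = adj.bool().unsqueeze(-1)                  # only entries with A_ij = 1 contribute
    masked = d_edge.masked_fill(~mask, float("-inf"))
    d_roi = masked.max(dim=0).values                 # element-wise max over each column j
    d_roi = torch.where(torch.isfinite(d_roi), d_roi, torch.zeros_like(d_roi))
    return mlp_roi(d_roi).squeeze(-1)                # (K,) refined confidences

# an illustrative scoring MLP (the width 32 is an assumed value of d_n)
mlp_roi = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 1), nn.Sigmoid())
\end{verbatim}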
\par
\textbf{Dual Confidence Selection within the NMS-free Paradigm.} With the help of the adjacency matrix $\boldsymbol{A}$, the variability among the semantic features $\{\boldsymbol{D}_j^{roi}\}$ is enlarged, resulting in a significant gap in the confidence scores $\{\tilde{s}_{j}^{g}\}$ generated by the O2O classification subhead, which makes them easier to distinguish. Therefore, unlike conventional methods that feed the confidence scores $\{s_{j}^{g}\}$ obtained by the O2M classification subhead into an NMS post-processing stage to remove redundant candidates, we adopt the following dual confidence selection criterion for selecting positive anchors:
\begin{align}
\Omega^{pos}=\left\{i|\tilde{s}_{i}^{g}>\tau_{o2o} \right\} \cap \left\{ i|s_{i}^{g}>\tau_{o2m} \right\}
\Omega^{pos}=\left\{i|\tilde{s}_{i}^{g}>\tau_{o2o} \right\} \cap \left\{ i|s_{i}^{g}>\tau_{o2m} \right\},
\end{align}
where $\tau_{o2o}$ and $\tau_{o2m}$ are two confidence thresholds. $\Omega^{pos}$ allows for non-redundant positive predictions without NMS post-processing, as the O2O classification subhead enhances the confidence score variability among similar anchors, making the selection less sensitive to the two confidence thresholds.
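The criterion itself is a simple set intersection; a sketch with placeholder threshold values follows.
\begin{verbatim}
import torch

def dual_confidence_selection(s_o2m, s_o2o, tau_o2m=0.4, tau_o2o=0.5):
    # keep an anchor only if both the O2M and the O2O scores exceed their thresholds
    keep = (s_o2o > tau_o2o) & (s_o2m > tau_o2m)     # Omega^pos, no NMS required
    return keep.nonzero(as_tuple=True)[0]
\end{verbatim}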
\par
@ -340,18 +340,18 @@ We use the F1-score to evaluate our model on the CULane, LLAMAS, DL-Rail, and Cu
\begin{align}
Pre\,\,&=\,\,\frac{TP}{TP+FP},
\\
Rec\,\,&=\,\,\frac{TP}{TP+FN}.
Rec\,\,&=\,\,\frac{TP}{TP+FN},
\\
F1&=\frac{2\times Pre\times Rec}{Pre\,\,+\,\,Rec},
\end{align}
where $TP$, $FP$ and $FN$ represent the true positives, false positives, and false negatives of the entire dataset, respectively. In our experiment, we use different IoU thresholds to calculate the F1-score for different datasets: $F1@50$ and $F1@75$ for CULane \cite{clrnet}, $F1@50$ for LLAMAS \cite{clrnet} and Curvelanes \cite{CondLaneNet}, and $F1@50$, $F1@75$, and $mF1$ for DL-Rail \cite{dalnet}. The $mF1$ is defined as:
\begin{align}
mF1=\left( F1@50+F1@55+\ldots+F1@95 \right) /10.
mF1=\left( F1@50+F1@55+\ldots+F1@95 \right) /10,
\end{align}
where $F1@50, F1@55, \ldots, F1@95$ are F1 metrics when IoU thresholds are $0.5, 0.55, \ldots, 0.95$, respectively.
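As a worked example of these metrics, the snippet below computes precision, recall, F1 at one IoU threshold, and mF1 as the average over the ten thresholds; the counts used are hypothetical.
\begin{verbatim}
def f1_score(tp, fp, fn):
    pre = tp / (tp + fp)
    rec = tp / (tp + fn)
    return 2 * pre * rec / (pre + rec)

f1_at_50 = f1_score(tp=900, fp=100, fn=150)     # about 0.878 for these made-up counts

def mf1(f1_per_threshold):
    # f1_per_threshold: the ten F1 values at IoU = 0.50, 0.55, ..., 0.95
    return sum(f1_per_threshold) / len(f1_per_threshold)
\end{verbatim}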
For TuSimple, the evaluation is formulated as follows:
\begin{align}
Accuracy=\frac{\sum{C_{clip}}}{\sum{S_{clip}}}.
Accuracy=\frac{\sum{C_{clip}}}{\sum{S_{clip}}},
\end{align}
where $C_{clip}$ and $S_{clip}$ represent the number of correct points (predicted points within 20 pixels of the ground truth) and the ground truth points, respectively. If the accuracy exceeds 85\%, the prediction is considered correct. TuSimple also reports the \textit{False Positive Rate} ($\mathrm{FPR}=1-\mathrm{Precision}$) and \textit{False Negative Rate} ($\mathrm{FNR}=1-\mathrm{Recall}$) metrics.
@ -370,7 +370,7 @@ All input images are cropped and resized to $800\times320$. Similar to \cite{clr
\hline
\textbf{Seg \& Grid} \\
\cline{1-1}
SCNN\cite{scnn} &VGG-16 &71.60&39.84&90.60&69.70&58.50&66.90&43.40&84.10&64.40&1900&66.10\\
SCNN\cite{scnn} &VGG-16 &71.60&39.84&90.60&69.70&58.50&66.90&43.40&84.10&64.40&1990&66.10\\
RESA\cite{resa} &ResNet50 &75.30&53.39&92.10&73.10&69.20&72.80&47.70&83.30&70.30&1503&69.90\\
LaneAF\cite{laneaf} &DLA34 &77.41&- &91.80&75.61&71.78&79.12&51.38&86.88&72.70&1360&73.03\\
UFLDv2\cite{ufldv2} &ResNet34 &76.0 &- &92.5 &74.8 &65.5 &75.5 &49.2 &88.8 &70.1 &1910&70.8 \\
@ -378,7 +378,7 @@ All input images are cropped and resized to $800\times320$. Similar to \cite{clr
\cline{1-1}
\textbf{Parameter} \\
\cline{1-1}
BézierLaneNet\cite{bezierlanenet} &ResNet18&73.67&-&90.22&71.55&62.49&70.91&45.30&84.09&58.98&\textbf{996} &68.70\\
BézierLaneNet\cite{bezierlanenet} &ResNet34&75.57&-&91.59&73.20&69.20&76.74&48.05&87.16 &62.45&\textbf{888}&69.90\\
BSNet\cite{bsnet} &DLA34 &80.28&-&93.87&78.92&75.02&82.52&54.84&90.73&74.71&1485&75.59\\
Eigenlanes\cite{eigenlanes} &ResNet50&77.20&-&91.7 &76.0 &69.8 &74.1 &52.2 &87.7 &62.9 &1509&71.8 \\
\cline{1-1}
@ -392,8 +392,8 @@ All input images are cropped and resized to $800\times320$. Similar to \cite{clr
\cline{1-1}
LaneATT\cite{laneatt} &ResNet18 &75.13&51.29&91.17&72.71&65.82&68.03&49.13&87.82&63.75&1020&68.58\\
LaneATT\cite{laneatt} &ResNet122&77.02&57.50&91.74&76.16&69.47&76.31&50.46&86.29&64.05&1264&70.81\\
CLRNet\cite{laneatt} &Resnet18 &79.58&62.21&93.30&78.33&73.71&79.66&53.14&90.25&71.56&1321&75.11\\
CLRNet\cite{laneatt} &DLA34 &80.47&62.78&93.73&79.59&75.30&82.51&54.58&90.62&74.13&1155&75.37\\
CLRNet\cite{clrnet} &ResNet18 &79.58&62.21&93.30&78.33&73.71&79.66&53.14&90.25&71.56&1321&75.11\\
CLRNet\cite{clrnet} &DLA34 &80.47&62.78&93.73&79.59&75.30&82.51&54.58&90.62&74.13&1155&75.37\\
CLRerNet\cite{clrernet} &DLA34 &81.12&64.07&94.02&80.20&74.41&\textbf{83.71}&56.27&90.39&74.67&1161&\textbf{76.53}\\
\cline{1-1}
\textbf{Sparse Anchor} \\
@ -520,7 +520,7 @@ All input images are cropped and resized to $800\times320$. Similar to \cite{clr
\end{table}
\subsection{Comparison with the state-of-the-art method}
The comparison results of our proposed model with other methods are shown in Tables \ref{culane result}, \ref{tusimple result}, \ref{llamas result}, \ref{dlrail result}, and \ref{curvelanes result}. We present results for two versions of our model: the NMS-based version, denoted as Polar R-CNN-NMS, and the NMS-free version, denoted as Polar R-CNN. The NMS-based version utilizes predictions $\left\{s_i^g\right\}$ obtained from the O2M head followed by NMS post-processing, while the NMS-free version derives predictions via dual confidence selection.
The comparison results of our proposed model with other methods are shown in Tables \ref{culane result}, \ref{tusimple result}, \ref{llamas result}, \ref{dlrail result}, and \ref{curvelanes result}. We present results for two versions of our model: the NMS-based version, denoted as \textit{Polar R-CNN-NMS}, and the NMS-free version, denoted as \textit{Polar R-CNN}. The NMS-based version utilizes predictions $\left\{s_i^g\right\}$ obtained from the O2M head followed by NMS post-processing, while the NMS-free version derives predictions via dual confidence selection.
To ensure a fair comparison, we also include results for CLRerNet \cite{clrernet} on the CULane and CurveLanes datasets, as we use a similar training strategy and dataset splits. As illustrated by the comparison results, our model demonstrates competitive performance across all five datasets. Specifically, on the CULane, TuSimple, LLAMAS, and DL-Rail datasets, which mainly consist of sparse scenarios, our model outperforms other anchor-based methods. Additionally, the performance of the NMS-free version is nearly identical to that of the NMS-based version, highlighting the effectiveness of the O2O classification subhead in eliminating redundant predictions in sparse scenarios. On the CurveLanes dataset, the NMS-free version achieves superior F1-measure and recall compared to other methods.
@ -683,11 +683,11 @@ We also explore the stop-gradient strategy for the O2O classification subhead. A
\multicolumn{2}{c|}{\textbf{Paradigm}} & \textbf{F1@50 (\%)} & \textbf{Precision (\%)} & \textbf{Recall (\%)} \\
\midrule
\multirow{2}*{Baseline}
&O2M-B w/~ NMS &78.83&88.99&70.75\\
&O2M w/~ NMS &78.83&88.99&70.75\\
&O2O-G w/o NMS &71.68\textcolor{darkgreen}{~(7.15$\downarrow$)}&72.56\textcolor{darkgreen}{~(16.43$\downarrow$)}&70.81\textcolor{red}{~(0.06$\uparrow$)}\\
\midrule
\multirow{2}*{Stop Grad}
&O2M-B w/~ NMS &80.81&88.53&74.33\\
&O2M w/~ NMS &80.81&88.53&74.33\\
&O2O-G w/o NMS &80.81\textcolor{red}{~(0.00$\uparrow$)}&88.52\textcolor{darkgreen}{~(0.01$\downarrow$)}&74.33\textcolor{red}{~(0.00$\uparrow$)} \\
\bottomrule
\end{tabular}
@ -710,7 +710,7 @@ We also explore the stop-gradient strategy for the O2O classification subhead. A
& 30 &86.26&90.44&82.45\\
& 25 &86.38&90.27&82.83\\
& 20 &86.57&90.05&83.37\\
& 15 (optimal) &86.81&89.64&84.16\\
& 15 (optimal) &\textbf{86.81}&89.64&84.16\\
& 10 &86.58&88.62&\textbf{84.64}\\
\midrule
Polar R-CNN & - &\textbf{87.29}&90.50&84.31\\
@ -827,7 +827,7 @@ We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classi
\caption{Fast NMS with Geometric Prior.}
\begin{algorithmic}[1] % the option [1] numbers every line
\REQUIRE ~~\\ % input parameters of the algorithm
The index of all anchors, $1, 2, ..., i, ..., K$;\\
The index of all anchors, $1, 2, \ldots, i, \ldots, K$;\\
The polar parameters of the corresponding anchors, $\left\{ \theta _i,r_{i}^{g} \right\} |_{i=1}^{K}$;\\
The confidence scores emanating from the O2M classification subhead, $\left\{ s_i^g \right\} |_{i=1}^{K}$;\\
The regressions emanating from the O2M regression subhead, denoted as $\left\{ Lane_i \right\} |_{i=1}^{K}$.\\
@ -836,16 +836,16 @@ We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classi
\STATE Calculate the confidence-prior adjacency matrix $\boldsymbol{A}^{C}\in\mathbb{R}^{K\times K}$, defined as follows:
\begin{align}
A_{ij}^{C}=\begin{cases}
1, s_i>s_j\,\,or\,\,\left( s_i^g=s_j^g\,\,and\,\,i>j \right)\\
0, others.\\
1,\, \mathrm{if}\,\, s_i^g>s_j^g\,\,or\,\,\left( s_i^g=s_j^g\,\,and\,\,i>j \right);\\
0,\, \mathrm{otherwise}.\\
\end{cases}
\label{confidential matrix}
\end{align}
\STATE Calculate the geometric-prior adjacency matrix $\boldsymbol{A}^{G}\in\mathbb{R}^{K\times K}$, which is defined as follows:
\begin{align}
A_{ij}^{G}=\begin{cases}
1, \left| \theta _i-\theta _j \right|<\tau^{\theta}\,\,and\,\,\left| r_{i}^{g}-r_{j}^{g} \right|<\tau^r\\
0, others.\\
1,\, \mathrm{if}\,\, \left| \theta _i-\theta _j \right|<\tau^{\theta}\,\,and\,\,\left| r_{i}^{g}-r_{j}^{g} \right|<\tau^r;\\
0,\, \mathrm{otherwise}.\\
\end{cases}
\label{geometric prior matrix}
\end{align}
@ -858,15 +858,15 @@ We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classi
\STATE Define the adjacency matrix $\boldsymbol{A} = \boldsymbol{A}^{C} \odot \boldsymbol{A}^{G}$; the final confidence $\tilde{s}_i^g$ is calculated as follows:
\begin{align}
\tilde{s}_{i}^{g}=\begin{cases}
1, \mathrm{Max}\left(\mathcal{D}(:,j)|\boldsymbol{A}(:,j)=1\right)<\left( \tau ^d \right) ^{-1},\\
0, \mathrm{otherwise}\\
1,\, \mathrm{if}\,\, \mathrm{Max}\left(\mathcal{D}(:,j)|\boldsymbol{A}(:,j)=1\right)<\left( \tau ^d \right) ^{-1};\\
0,\,\mathrm{otherwise},\\
\end{cases}
\label{al_1-4}
\end{align}
where $j=1,2,\cdots,K$ and $\mathrm{Max}(\cdot|\boldsymbol{A}(:,j)=1)$ denotes the max operator taken over the $j$-th column of $\mathcal{D}$, restricted to the rows whose corresponding elements in the $j$-th column of $\boldsymbol{A}$ equal $1$.
\STATE Get the final selection set:
\begin{align}
\varOmega_{nms}^{pos}=\left\{ i|s_{j}^{g}>\tau_{o2m}\,\,and\,\,\tilde{s}_{j}^{g}=1 \right\}
\varOmega_{nms}^{pos}=\left\{ i|\tilde{s}_{i}^{g}=1 \right\} \cap \left\{i|s_{i}^{g}>\tau_{o2m} \right\}.
\label{al_1-5}
\end{align}
@ -875,7 +875,7 @@ We draw inspiration from Fast NMS \cite{yolact} for the design of the O2O classi
\label{Graph Fast NMS}
\end{algorithm}
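A vectorized sketch of the selection step of Algorithm \ref{Graph Fast NMS} is given below. It assumes the overall adjacency matrix has already been built from the confidence and geometric priors (as in the earlier sketch), and that the pairwise matrix of Eq. (\ref{al_1-3}) grows as two lanes get closer, since it is compared against $\left(\tau^d\right)^{-1}$; names and thresholds are illustrative.
\begin{verbatim}
import torch

def graph_fast_nms_select(adj, d, s_g, tau_d, tau_o2m):
    # adj, d: (K, K); s_g: (K,) confidences from the O2M classification subhead
    d_masked = torch.where(adj.bool(), d, torch.full_like(d, float("-inf")))
    max_d = d_masked.max(dim=0).values     # Max(D(:, j) | A(:, j) = 1) for every column j
    s_tilde = max_d < 1.0 / tau_d          # anchors without a close suppressor survive
    keep = s_tilde & (s_g > tau_o2m)       # final selection set
    return keep.nonzero(as_tuple=True)[0]
\end{verbatim}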
The new algorithm possesses a distinctly different format from its predecessor\cite{yolact}. We introduce a geometric-prior adjacency matrix characterized by $\boldsymbol{A}^G$, alleviating the suppression relationship between disparate anchors. It is manifestly to demonstrate that, when all elements within $\boldsymbol{A}^{G}$ are all set as $1$ (\textit{i.e.} disregarding geometric priors), Algorithm \ref{Graph Fast NMS} is equivalent to Fast NMS. Building upon our newly proposed sort-free Fast NMS with geometric prior, we design the structure of the one-to-one classification head.
The new algorithm possesses a distinctly different format from its predecessor \cite{yolact}. We introduce a geometric-prior adjacency matrix $\boldsymbol{A}^G$, which alleviates the suppression relationship between disparate anchors. It is easy to show that, when all elements of $\boldsymbol{A}^{G}$ are set to $1$ (\textit{i.e.}, disregarding geometric priors), Algorithm \ref{Graph Fast NMS} is equivalent to Fast NMS. Building upon our newly proposed sort-free Fast NMS with geometric prior, we design the structure of the O2O classification subhead.
The principal limitations of NMS lie in two steps: the distance definition, which stems purely from geometry (\textit{i.e.}, Eq. (\ref{al_1-3})), and the threshold $\tau^{d}$ employed to eliminate redundant predictions (\textit{i.e.}, Eq. (\ref{al_1-4})). For instance, in scenarios involving double lines, despite the minimal geometric distance between the two lane instances, their semantic divergence is remarkably pronounced. Consequently, we replace the aforementioned two steps with trainable neural networks, allowing them to alleviate the limitations of Fast NMS in a data-driven fashion. The neural network blocks that replace Eq. (\ref{al_1-3}) are Eqs. (\ref{edge_layer_1})-(\ref{edge_layer_3}) in the main text.
@ -884,14 +884,14 @@ We use element-wise max pooling for the tensor, as the repalcement of the max op
The score $\tilde{s}_{i}^{g}$ output by the neural network transitions from a binary score to a continuous soft score ranging from 0 to 1. We introduce a new threshold $\tau_{o2o}$ within the updated criterion of Eq. (\ref{al_1-5}):
\begin{align}
\varOmega_{nms-free}^{pos}=\left\{ j|s_{j}^{g}>\tau_{o2m}\,\,and\,\,\tilde{s}_{j}^{g}>\tau_{o2o}\right\}.
\varOmega_{nms-free}^{pos}=\left\{i|\tilde{s}_{i}^{g}>\tau_{o2o} \right\} \cap \left\{ i|s_{i}^{g}>\tau_{o2m} \right\}.
\end{align}
This criterion is also referred to as the \textit{dual confidence selection} in the main text.
\label{NMS_appendix}
\begin{table*}[htbp]
\centering
\caption{Infos and hyperparameters for five datasets. For the CULane dataset, $*$ denotes the actual number of training samples used to train the model. Labels for some validation/test sets are missing and different splits (\textit{i.e.} validation and test set) are selected for different datasets.}
\caption{Information and hyperparameters for the five datasets. For the CULane dataset, $*$ denotes the actual number of training samples used to train the model. Labels for some validation/test sets are missing, so different splits (\textit{i.e.}, validation or test set) are selected for different datasets.}
\begin{adjustbox}{width=\linewidth}
\begin{tabular}{l|l|ccccc}
\toprule
@ -946,7 +946,7 @@ To ensure the IoU between lane instances aligns with the conventions of general
\begin{align}
\Delta x_{i,p}^{d}&=x_{i+1,p}^{d}-x_{i-1,p}^{d},\,\, \Delta y_{i,p}^{d}=y_{i+1,p}^{d}-y_{i-1,p}^{d}, \\
w_{i,p}&=\frac{\sqrt{\left( \Delta x_{i,p}^{d} \right) ^2+\left( \Delta y_{i,p}^{d} \right) ^2}}{\Delta y_{i,p}^{d}}w^b,\\
b_{i,p}^{l}&=x_{i,p}^{d}-w_{i,p},\,\, b_{i,p}^{r}=x_{i,p}^{d}+w_{i,p},\,\, \\
b_{i,p}^{l}&=x_{i,p}^{d}-w_{i,p},\,\, b_{i,p}^{r}=x_{i,p}^{d}+w_{i,p},
\end{align}
where $w^{b}$ is the base semi-width parameter and $w_{i,p}$ is the actual semi-width of the $p$-th lane instance. The sets $\left\{ b_{i,p}^{l} \right\} _{i=1}^{N}$ and $\left\{ b_{i,p}^{r} \right\} _{i=1}^{N}$ signify the left and right boundaries of the $p$-th lane instance. Subsequently, we define the intersection and union between lane instances:
\begin{align}
@ -984,14 +984,7 @@ Suppose that there exist $K$ predictions and $G$ ground truth. Let $\pi$ denotes
\hat{\pi}=\underset{\pi \in \mathscr{S}_{K,G}}{\arg\max}\sum_{q=1}^G{\mathcal{C} _{\pi \left( q \right) ,q}^{o2o}}.
\end{align}
This assignment problem can be solved by the Hungarian algorithm \cite{detr}. Finally, $G$ predictions are assigned as positive samples and $K-G$ predictions are assigned as negative samples.
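For reference, the one-to-one assignment can be sketched with an off-the-shelf Hungarian solver; the matching-quality matrix below is a hypothetical input and only its shape matters.
\begin{verbatim}
import numpy as np
from scipy.optimize import linear_sum_assignment

def o2o_assign(quality):
    # quality: (K, G) matrix of matching qualities C^{o2o} between predictions and GTs
    pred_idx, gt_idx = linear_sum_assignment(-quality)   # maximize the total quality
    positives = set(pred_idx.tolist())                   # G positive predictions
    negatives = [p for p in range(quality.shape[0]) if p not in positives]
    return dict(zip(gt_idx.tolist(), pred_idx.tolist())), negatives
\end{verbatim}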
In the one-to-many label assignment, we simply use SimOTA \cite{yolox}, which aligns with previous works \cite{clrernet}. Omitting the detailed process of SimOTA, we only introduce the inputs to it, namely the cost matrix $\boldsymbol{M}^C\in \mathbb{R}^{G\times K}$ and the IoU matrix $\boldsymbol{M}^{IoU}\in \mathbb{R}^{G\times K}$. The elements in the two matrices are defined as $M^C_{qp}=\mathcal{C} _{p,q}^{o2m}$ and $M^{IoU}_{qp}= GIoU_{lane}\left( p,q \right)$ (with $g=0$), respectively. The number of assigned predictions for each ground truth is variable but does not exceed an upper bound $k_{dynamic}$, which is set to $4$ in our experiment. Finally, there are $K_{pos}$ positive samples and $K-K_{pos}$ negative samples, where $K_{pos}$ ranges from $0$ to $Gk_{dynamic}$.
Given the ground truth label generated by the label assignment strategy for each prediction, we can conduct the loss function during phase. As illustrated in Fig. \ref{head_assign}, $\mathcal{L}_{cls}^{o2o}$ and $\mathcal{L}_{rank}$ are for the O2O classification subhead, $\mathcal{L}_{cls}^{o2m}$ is for the O2M classification subhead whereas $\mathcal{L}_{GIOU}$ (with $g=1$), $\mathcal{L}_{end}$ and $\mathcal{L}_{aux}$ for the O2M regression subhead.
\label{assign_appendix}
\begin{figure*}[t]
\begin{figure*}[ht]
\centering
\def\pagewidth{0.49\textwidth}
\def\subwidth{0.47\linewidth}
@ -1123,12 +1116,10 @@ Given the ground truth label generated by the label assignment strategy for each
\end{subfigure}
\vspace{0.5em}
\caption{Visualization of detection outcomes in sparse scenarios of four dataset.}
\caption{Visualization of detection outcomes in sparse scenarios of four datasets.}
\label{vis_sparse}
\end{figure*}
\begin{figure*}[t]
\centering
\def\subwidth{0.24\textwidth}
@ -1227,12 +1218,23 @@ Given the ground truth label generated by the label assignment strategy for each
\caption{Visualization of the detection outcomes in sparse and dense scenarios on the CurveLanes dataset.}
\label{vis_dense}
\end{figure*}
In the one-to-many label assignment, we simply use SimOTA \cite{yolox}, which aligns with previous works \cite{clrernet}. Omitting the detailed process of SimOTA, we only introduce the inputs to it, namely the cost matrix $\boldsymbol{M}^C\in \mathbb{R}^{G\times K}$ and the IoU matrix $\boldsymbol{M}^{IoU}\in \mathbb{R}^{G\times K}$. The elements in the two matrices are defined as $M^C_{qp}=\mathcal{C} _{p,q}^{o2m}$ and $M^{IoU}_{qp}= GIoU_{lane}\left( p,q \right)$ (with $g=0$), respectively. The number of assigned predictions for each ground truth is variable but does not exceed an upper bound $k_{dynamic}$, which is set to $4$ in our experiment. Finally, there are $K_{pos}$ positive samples and $K-K_{pos}$ negative samples, where $K_{pos}$ ranges from $0$ to $Gk_{dynamic}$.
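The snippet below sketches only the dynamic-$k$ idea behind this one-to-many assignment and is deliberately simplified: the way $k$ is estimated from the IoU matrix and the absence of conflict resolution between ground truths are assumptions, not the exact procedure of SimOTA \cite{yolox}.
\begin{verbatim}
import torch

def simota_like_assign(cost, iou, k_dynamic=4):
    # cost, iou: (G, K) matrices M^C and M^IoU between ground truths and predictions
    pos = torch.zeros(cost.shape[1], dtype=torch.bool)
    for q in range(cost.shape[0]):
        # dynamic k: roughly the summed IoU of the best candidates, clamped to [1, k_dynamic]
        k = int(iou[q].topk(min(k_dynamic, iou.shape[1])).values.sum().clamp(1, k_dynamic))
        idx = cost[q].topk(k, largest=False).indices     # the k lowest-cost predictions
        pos[idx] = True
    return pos                                           # boolean mask of positive samples
\end{verbatim}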
Given the ground truth label generated by the label assignment strategy for each prediction, we can construct the loss functions for the training phase. As illustrated in Fig. \ref{head_assign}, $\mathcal{L}_{cls}^{o2o}$ and $\mathcal{L}_{rank}$ are for the O2O classification subhead, $\mathcal{L}_{cls}^{o2m}$ is for the O2M classification subhead, whereas $\mathcal{L}_{GIOU}$ (with $g=1$), $\mathcal{L}_{end}$ and $\mathcal{L}_{aux}$ are for the O2M regression subhead.
\label{assign_appendix}
\section{Supplementary Implementation Details and Visualization Results}
Some important implementation details for each dataset are shown in Table \ref{dataset_info}, including the dataset information used for the experiments and visualizations, the data processing parameters, and the hyperparameters of Polar R-CNN.
Fig. \ref{vis_sparse} illustrates the visualization outcomes in sparse scenarios spanning four datasets. The top row depicts the ground truth, the middle row shows the proposed lane anchors, and the bottom row exhibits the predictions generated by Polar R-CNN with the NMS-free paradigm. In the top and bottom rows, different colors are used to distinguish different lane instances and do not correspond across images. From the images in the middle row, we can see that the LPM of Polar R-CNN effectively proposes anchors that are clustered around the ground truth, providing a robust prior for the GPM to achieve the final lane predictions. Moreover, the number of anchors is significantly decreased compared to previous works, making our method theoretically faster than other anchor-based methods.
Fig. \ref{vis_dense} shows the visualization outcomes in dense scenarios. The first column displays the ground truth, while the second and the third columns reveal the detection results with NMS paradigm of large (\textit{i.e.} the default threshold NMS@50 with 50 pixels) and small (\textit{i.e.} the optimal threshold NMS@15 with 15 pixels) NMS thresholds, respectively. The final column shows the detection results with NMS-free paradigm. We observe that NMS@50 mistakenly removes some predictions, leading to false negatives, while NMS@15 fails to eliminate some redundant predictions, leading to false positives. This underscores that the trade-off struggles between large and small NMS thresholds. The visualization distinctly demonstrates that distance becomes less effective in dense scenarios. Only the proposed O2O classification subhead, driven by data, can address this issue by capturing semantic distance beyond geometric distance. As shown in the last column of Fig. \ref{vis_dense}, the O2O classification subhead successfully eliminates redundant predictions while preserving dense predictions, despite their minimal geometric distances.
Fig. \ref{vis_dense} shows the visualization outcomes in dense scenarios. The first column displays the ground truth, while the second and third columns show the detection results of the NMS paradigm with a large threshold (\textit{i.e.}, the default NMS@50 with 50 pixels) and a small threshold (\textit{i.e.}, the optimal NMS@15 with 15 pixels), respectively. The final column shows the detection results of the NMS-free paradigm. We observe that NMS@50 mistakenly removes some predictions, leading to false negatives, while NMS@15 fails to eliminate some redundant predictions, leading to false positives. This underscores the difficult trade-off between large and small NMS thresholds. The visualization distinctly demonstrates that geometric distance becomes less effective in dense scenarios. Only the proposed O2O classification subhead, driven by data, can address this issue by capturing semantic distance beyond geometric distance. As shown in the last column of Fig. \ref{vis_dense}, the O2O classification subhead successfully eliminates redundant predictions while preserving dense predictions, despite their minimal geometric distances.
\label{vis_appendix}
\end{document}


@ -537,3 +537,14 @@
pages={652--660},
year={2017}
}
@article{vanishing,
title={Vanishing point constrained lane detection with a stereo camera},
author={Su, Yingna and Zhang, Yigong and Lu, Tao and Yang, Jian and Kong, Hui},
journal={IEEE Transactions on Intelligent Transportation Systems},
volume={19},
number={8},
pages={2739--2744},
year={2018},
publisher={IEEE}
}

Binary file not shown.


Binary file not shown.