This commit is contained in:
ShqWW 2024-09-17 15:37:47 +08:00
parent 789b1dcdcd
commit e2acc30cda
9 changed files with 138 additions and 125 deletions

216
main.tex
View File

@ -38,35 +38,27 @@
\thanks{X. Cao is with the School of Computer Science and Technology and the Ministry of Education Key Lab for Intelligent Networks and Network Security, Xi'an Jiaotong University, Xi'an 710049, China.}
}
%\thanks{Manuscript received April 19, 2021; revised August 16, 2021.}}
% The paper headers
% The paper headers
\markboth{S. Wang \MakeLowercase{\textit{et al.}}: Polar R-CNN:\@ End-to-End Lane Detection with Fewer Anchors}%
{S. Wang \MakeLowercase{\textit{et al.}}: Polar R-CNN:\@ End-to-End Lane Detection with Fewer Anchors}
% \IEEEpubid{0000--0000/00\$00.00~\copyright~2021 IEEE}
% Remember, if you use this you must call \IEEEpubidadjcol in the second
% column for its text to clear the IEEEpubid mark.
\maketitle
\begin{abstract}
Lane detection is a critical and challenging task in autonomous driving, particularly in real-world scenarios where traffic lanes are often slender, lengthy, and partially obscured by other vehicles, complicating detection efforts. Existing anchor-based methods typically rely on prior lane anchors to extract features and refine lane location and shape. Although these methods achieve high performance, manually setting prior anchors is cumbersome, and ensuring adequate coverage across diverse datasets often requires a large number of dense anchors. Additionally, non-maximum suppression is used to suppress redundant predictions, which complicates real-world deployment and may fail in dense scenarios. In this study, we introduce Polar R-CNN, an NMS-free anchor-based method for lane detection. By incorporating both local and global polar coordinate systems, Polar R-CNN enables flexible anchor proposals and significantly reduces the number of anchors required without compromising performance. Additionally, we introduce a heuristic GNN-based NMS-free head that supports an end-to-end paradigm, making the model more deployment-friendly and enhancing performance in dense scenarios. Our method achieves competitive results on five popular lane detection benchmarks—TuSimple, CULane, LLAMAS, CurveLanes, and DL-Rail—while maintaining a lightweight design and straightforward structure. Our source code is available at \href{https://github.com/ShqWW/PolarRCNN}{\textit{https://github.com/ShqWW/PolarRCNN}}.
Lane detection is a critical and challenging task in autonomous driving, particularly in real-world scenarios where traffic lanes can be slender, lengthy, and often obscured by other vehicles, complicating detection efforts. Existing anchor-based methods typically rely on prior lane anchors to extract features and refine the location and shape of lanes. While these methods achieve high performance, manually setting prior anchors is cumbersome, and ensuring sufficient coverage across diverse datasets often requires a large number of dense anchors. Furthermore,
the use of \textit{Non-Maximum Suppression} (NMS) to eliminate redundant predictions complicates real-world deployment and may underperform in complex scenarios. In this paper, we propose \textit{Polar R-CNN}, an NMS-free anchor-based method for lane detection. By incorporating both local and global polar coordinate systems, Polar R-CNN facilitates flexible anchor proposals and significantly reduces the number of anchors required without compromising performance. Additionally, we introduce a heuristic \textit{Graph Neural Network} (GNN)-based NMS-free head that supports an end-to-end paradigm, enhancing deployment efficiency and performance in scenarios with dense lanes. Our method achieves competitive results on five popular lane detection benchmarks—\textit{TuSimple}, \textit{CULane}, \textit{LLAMAS}, \textit{CurveLanes}, and \textit{DL-Rail}—while maintaining a lightweight design and straightforward structure. Our source code is available at \href{https://github.com/ShqWW/PolarRCNN}{\textit{https://github.com/ShqWW/PolarRCNN}}.
\end{abstract}
\begin{IEEEkeywords}
Lane detection, NMS-free, Graph neural network, Polar coordinate system.
Lane Detection, NMS-Free, Graph Neural Network, Polar Coordinate System.
\end{IEEEkeywords}
\section{Introduction}
\IEEEPARstart{L}{ane} detection is a significant problem in computer vision and autonomous driving, forming the basis for accurately perceiving the driving environment in intelligent driving systems. While extensive research has been conducted in ideal environments, it remains a challenging task in adverse scenarios such as night driving, glare, crowd, and rainy conditions, where lanes may be occluded or damaged. Moreover, the slender shapes, complex topologies of lanes and the global property add to the complexity of detection challenges. An effective lane detection method should take into account both global high-level semantic features and local low-level features to address these varied conditions and ensure robust performance in real-time applications such as autonomous driving.
Traditional methods predominantly concentrate on handcrafted local feature extraction and lane shape modeling. Techniques such as the Canny edge detector\cite{cannyedge}, Hough transform\cite{houghtransform}, and deformable templates for lane fitting\cite{kluge1995deformable} have been extensively utilized. Nevertheless, these approaches often encounter limitations in practical settings, particularly when low-level and local features lack clarity and distinctiveness.
In recent years, fueled by advancements in deep learning and the availability of large datasets, significant strides have been made in lane detection. Deep models, including convolutional neural networks (CNNs) and transformer-based architectures, have propelled progress in this domain. Previous approaches often treated lane detection as a segmentation task, which, despite its simplicity, involved time-consuming computations. Some methods relied on parameter-based models, directly outputting lane curve parameters instead of pixel locations. These models offer end-to-end solutions, but the curve parameter sensitivity to lane shape compromises robustness.
\IEEEPARstart{L}{ane} detection is a critical task in computer vision and autonomous driving, aimed at identifying and tracking lane markings on the road. While extensive research has been conducted in ideal environments, it is still challenging in adverse scenarios such as night driving, glare, crowd, and rainy conditions, where lanes may be occluded or damaged. Moreover, the slender shapes and complex topologies of lanes further complicate detection efforts. %Therefore, an effective lane detection method should take into account both global high-level semantic features and local low-level features to address these varied conditions and ensure robust performances in a real-time application. along with their global properties,
\par
Over the past few decades, many methods have primarily focused on handcrafted local feature extraction and lane shape modeling. Techniques such as the \textit{Canny edge detector}\cite{cannyedge}, \textit{Hough transform}\cite{houghtransform}, and \textit{deformable templates}\cite{kluge1995deformable} have been widely employed for lane fitting. However, these approaches often face limitations in real-world scenarios, especially when low-level and local features lack clarity and distinctiveness.
\par
In recent years, advancements in deep learning and the availability of large datasets have led to significant progress in lane detection, especially with deep models such as \textit{Convolutional Neural Networks} (CNNs)\cite{scnn} and \textit{transformer-based} architectures \cite{lstr}. Based on this, earlier approaches typically framed lane detection as a \textit{segmentation task} \cite{lanenet}, which, despite its straightforwardness, required time-consuming computations. There are still some methods that rely on \textit{parameter-based} models, which directly output lane curve parameters rather than pixel locations \cite{bezierlanenet}\cite{polylanenet}\cite{lstr}. Although these segmentation-based and parameter-based methods provide end-to-end solutions, their sensitivity to lane shape compromises their robustness.
\begin{figure}[t]
\centering
\def\subwidth{0.24\textwidth}
@ -90,7 +82,7 @@ In recent years, fueled by advancements in deep learning and the availability of
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/anchor_demo/gt.jpg}
\caption{}
\end{subfigure}
\caption{Anchor settings of different methods. (a) The initial anchor settings of CLRNet. (b) The learned anchor settings of CLRNet trained on CULane. (c) The proposed anchors of our method. (d) The ground truth.}
\caption{Anchor settings of different methods. (a) The initial anchor settings of CLRNet. (b) The learned anchor settings of CLRNet trained on CULane. (c) The learned anchors of our method. (d) The ground truth.}
\label{anchor setting}
\end{figure}
@ -106,12 +98,11 @@ In recent years, fueled by advancements in deep learning and the availability of
\caption{}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/nms_demo/less_pred.jpg}
\caption{}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/nms_demo/redun_gt.jpg}
\caption{}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/nms_demo/redun_gt.jpg}
\includegraphics[width=\imgwidth, height=\imgheight]{thesis_figure/nms_demo/less_pred.jpg}
\caption{}
\end{subfigure}
\begin{subfigure}{\subwidth}
@ -119,118 +110,93 @@ In recent years, fueled by advancements in deep learning and the availability of
\caption{}
\end{subfigure}
\caption{Comparison of different NMS thresholds in sparse and dense scenarios. (a) Ground truth in a dense scenario, where some lane instances are close to each other. (b) Predictions with a large NMS threshold in a dense scenario, where a lane prediction is mistakenly suppressed. (c) Ground truth in a sparse scenario, where the lane instances are far apart. (d) Predictions with a small NMS threshold in a sparse scenario, where redundant prediction results fail to be removed.}
\caption{Comparison of NMS thresholds in \textit{sparse} and \textit{dense} scenarios. (a) and (b) Ground truths in dense and sparse scenarios, respectively. (c) Predictions with a large NMS threshold in a dense scenario, resulting in a lane prediction being mistakenly suppressed. (d) Predictions with a small NMS threshold in a sparse scenario, where redundant prediction results are not effectively removed.}
\label{NMS setting}
\end{figure}
Drawing inspiration from object detection methods such as YOLO \cite{yolov10} and Faster R-CNN \cite{fasterrcnn}, several anchor-based approaches have been introduced for lane detection, with representative works including LaneATT \cite{laneatt} and CLRNet \cite{clrnet}. These methods have demonstrated superior performance by leveraging anchor priors and enabling larger receptive fields for feature extraction. However, anchor-based lane detection methods share the drawbacks of anchor-based general object detection methods, as follows:
(1) A large number of lane anchors are placed throughout the image, even in sparse scenarios. Sparse scenarios refer to situations where lanes are distributed sparsely and are located far from each other, as illustrated in Fig. \ref{anchor setting}(d).
(2) Non-maximum suppression (NMS) post-processing is required to remove redundant predictions but may struggle in dense scenarios. Dense scenarios involve situations where lanes are close to each other, such as forked lanes and double lanes, as depicted in Fig. \ref{NMS setting}(a).
Regarding the first issue, \cite{clrnet} introduced learned anchors, where the anchor parameters are optimized during training to adapt to the lane distributions (see Fig. \ref{anchor setting}(b)) in real datasets. Additionally, they employ cascade cross-layer anchor refinement to bring the anchors closer to the ground truth. However, the anchors are still numerous to cover the potential distributions of lanes. Moving further, \cite{adnet} proposes flexible anchors for each image by generating start points, rather than using a fixed set of anchors for all images. Nevertheless, the start points of lanes are subjective and lack clear visual evidence due to the global nature of lanes, which affects its performance. \cite{srlane} uses a local angle map to propose sketch anchors according to the direction of ground truth. This approach only considers the direction and neglects the accurate positioning of anchors, resulting in suboptimal performance without cascade anchor refinement. Overall, numerous anchors are unnecessary in sparse scenarios (where lane ground truths are sparse). The trend in newly proposed methods is to reduce the number of anchors and offer more flexible anchor configurations.
Regarding the second issue, nearly all anchor-based methods (including those mentioned above) require direct or indirect NMS post-processing to eliminate redundant predictions. Although it is necessary to eliminate redundant predictions, NMS remains a suboptimal solution. On the one hand, NMS is not deployment-friendly because it involves defining and calculating distances (e.g., Intersection over Union) between lane pairs. This is more challenging than bounding boxes in general object detection due to the complexity of lane geometry. On the other hand, NMS fails in some dense scenarios where the lane ground truths are closer together compared to sparse scenarios. A large distance threshold may result in false negatives, as some true positive predictions might be eliminated (as shown in Fig. \ref{NMS setting}(a)\&(b)) by mistake. Conversely, a small distance threshold may not eliminate redundant predictions effectively and can leave false positives (as shown in Fig. \ref{NMS setting}(c)\&(d)). Achieving an optimal trade-off in all scenarios by manually setting the distance threshold is challenging. The root cause of this problem is that the distance definition in NMS considers only geometric parameters while ignoring the semantic context in the image. Thus, when two predictions are “close” to each other, it is nearly impossible to determine whether one of them is redundant.
To address the two issues outlined above, we propose Polar R-CNN, a novel anchor-based method for lane detection. For the first issue, we introduce local and global heads based on the polar coordinate system to create anchors with more accurate locations and reduce the number of proposed anchors in sparse scenarios, as illustrated in Fig. \ref{anchor setting}(c). Compared to state-of-the-art previous work \cite{clrnet}\cite{clrernet} which uses 192 anchors, Polar R-CNN employs only 20 anchors to cover potential lane ground truths. For the second issue, we have revised Fast NMS to Graph-based Fast NMS and introduced a new heuristic graph neural network block (Polar GNN block) integrated into the NMS head. The Polar GNN block offers an interpretable structure, achieving nearly equivalent performance in sparse scenarios and superior performance in dense scenarios. We conducted experiments on five major benchmarks: TuSimple \cite{tusimple}, CULane \cite{scnn}, LLAMAS \cite{llamas}, CurveLanes \cite{curvelanes}, and DL-Rail \cite{dalnet}. Our proposed method demonstrates competitive performance compared to state-of-the-art methods.
Our main contributions are summarized as follows:
%, where some lane instances are close with each others; , where the lane instance are far apart
\par
Drawing inspiration from object detection methods such as \textit{YOLO} \cite{yolov10} and \textit{Faster R-CNN} \cite{fasterrcnn}, several anchor-based approaches have been introduced for lane detection, with representative works including \textit{LaneATT} \cite{laneatt} and \textit{CLRNet} \cite{clrnet}. These methods have shown superior performance by leveraging anchor \textit{priors} (as shown in Fig. \ref{anchor setting}) and enabling larger receptive fields for feature extraction. However, anchor-based methods encounter similar drawbacks to those in general object detection, including the following:
\begin{itemize}
\item We simplified the anchor parameters using local and global polar coordinate systems and applied them to two-stage lane detection frameworks. Compared to other anchor-based methods, the number of proposed anchors is greatly reduced while achieving better performance.
\item We introduced a novel heuristic Polar GNN block to implement a NMS-free paradigm. The GNN architecture is designed with reference to Graph-based Fast NMS, providing interpretability. Our model supports end-to-end training and testing, but traditional NMS post-processing can still be used as an option for a NMS version of our model.
\item Our method utilizes two-stage architectures and achieves competitive performance compared to state-of-the-art methods across five datasets. The high performance with fewer anchors and a NMS-free paradigm demonstrates the effectiveness of our approach. Additionally, our model is designed with a straightforward structure (without cascade refinement or attention strategies), which simplifies deployment.
\item As shown in Fig. \ref{anchor setting}(a), a large number of lane anchors are predefined in the image, even in \textbf{\textit{sparse scenarios}}---situations where lanes are distributed widely and located far apart from each other, as illustrated in Fig. \ref{anchor setting}(d).
\item A \textit{Non-Maximum Suppression} (NMS) post-processing step is required to eliminate redundant predictions but may struggle in \textbf{\textit{dense scenarios}} where lanes are close to each other, such as forked lanes and double lanes, as illustrated in Fig. \ref{NMS setting}(a).
\end{itemize}
\section{Related Works}
Lane detection aims to detect lane instances in an image. In this section, we only introduce deep-learning-based methods for lane detection. Lane detection methods can be categorized into segmentation-based, parameter-based, and anchor-based methods.
\textbf{Segmentation-based Methods.} Segmentation-based methods focus on pixel-wise prediction. They predefined each pixel into different categories according to different lane instances and background\cite{lanenet} and predicted information pixel by pixel. However, these methods overly focus on low-level and local features, neglecting global semantic information and real-time detection. SCNN uses a larger receptive field to overcome this problem. Some methods such as UFLDv1 and v2\cite{ufld}\cite{ufldv2} and CondLaneNet\cite{CondLaneNet} utilize row-wise or column-wise classification instead of pixel classification to improve detection speed. Another issue with these methods is that the lane instance prior is learned by the model itself, leading to a lack of prior knowledge. Lanenet uses post-clustering to distinguish each lane instance. UFLD divides lane instances by angles and locations and can only detect a fixed number of lanes. CondLaneNet utilizes different conditional dynamic kernels to predict different lane instances. Some methods such as FOLOLane\cite{fololane} and GANet\cite{ganet} use bottom-up strategies to detect a few key points and model their global relations to form lane instances.
\textbf{Parameter-based Methods.} Instead of predicting a series of point locations or pixel classes, parameter-based methods directly generate the curve parameters of lane instances. PolyLaneNet\cite{polylanenet} and LSTR\cite{lstr} consider the lane instance as a polynomial curve and output the polynomial coefficients directly. BézierLaneNet\cite{bezierlanenet} treats the lane instance as a Bézier curve and generates the locations of control points of the curve. BSLane uses B-Spline to describe the lane, and the curve parameters focus on the local shapes of lanes. Parameter-based methods are mostly end-to-end without post-processing, which grants them faster speed. However, since the final visual lane shapes are sensitive to the curve parameters, the robustness and generalization of parameter-based methods may be less than ideal.
\textbf{Anchor-Based Methods.} Inspired by general object detection methods like YOLO \cite{yolov10} and Faster R-CNN \cite{fasterrcnn}, anchor-based approaches have been proposed for lane detection. Line-CNN is, to our knowledge, the earliest method that utilizes line anchors for detecting lanes. These lines are designed as rays emitted from the three edges (left, bottom, and right) of an image. However, the model's receptive field is limited to the edges, which makes it suboptimal for capturing the entire lane. LaneATT \cite{laneatt} improves upon this by employing anchor-based feature pooling to aggregate features along the entire line anchor, achieving faster speeds and better performance. Nevertheless, its grid sampling strategy and label assignment pose limitations. CLRNet \cite{clrnet} enhances anchor-based performance with cross-layer refinement strategies, SimOTA label assignment \cite{yolox}, and LIoU loss, surpassing many previous methods. A key advantage of anchor-based methods is their adaptability, allowing the integration of strategies from anchor-based general object detection, such as label assignment, bounding box refinement, and GIoU loss. However, existing anchor-based lane detection methods also have notable drawbacks. Lane anchors are often handcrafted and numerous, which can be cumbersome. Some approaches, such as ADNet \cite{adnet}, SRLane \cite{srlane}, and Sparse Laneformer \cite{sparse}, attempt to reduce the number of anchors and provide flexible proposals, but this can slightly impact performance. Additionally, methods such as \cite{clrernet} \cite{adnet} still rely on NMS post-processing, complicating NMS threshold settings and model deployment. Although one-to-one label assignment (during training) without NMS \cite{detr}\cite{o2o} (during evaluation) alleviates this issue, its performance remains less satisfactory compared to NMS-based models.
\par
Regarding the first issue, \cite{clrnet} introduced learned anchors that optimize the anchor parameters during training to better adapt to lane distributions, as shown in Fig. \ref{anchor setting}(b). However, the number of anchors remains excessive to adequately cover the diverse potential distributions of lanes. Furthermore, \cite{adnet} proposes flexible anchors for each image by generating start points, rather than using a fixed set of anchors. Nevertheless, these start points of lanes are subjective and lack clear visual evidence due to the global nature of lanes. In contrast, \cite{srlane} uses a local angle map to propose sketch anchors according to the direction of ground truth. While this approach considers directional alignment, it neglects precise anchor positioning, resulting in suboptimal performance. Overall, the abundance of anchors is unnecessary in sparse scenarios.% where lane ground truths are sparse. The trend in new methodologies is to reduce the number of anchors while offering more flexible anchor configurations.%, which negatively impacts its performance. They also employ cascade cross-layer anchor refinement to bring the anchors closer to the ground truth. in the absence of cascade anchor refinement
\par
Regarding the second issue, nearly all anchor-based methods \cite{laneatt}\cite{clrnet}\cite{adnet}\cite{srlane} rely on direct or indirect NMS post-processing to eliminate redundant predictions. Although it is necessary to eliminate redundant predictions, NMS remains a suboptimal solution. On one hand, NMS is not deployment-friendly because it requires defining and calculating distances between lane pairs using metrics such as \textit{Intersection over Union} (IoU). This task is more challenging than in general object detection due to the intricate geometry of lanes. On the other hand, NMS can struggle in dense scenarios. Typically, a large distance threshold may lead to false negatives, as some true positive predictions could be mistakenly eliminated, as illustrated in Fig. \ref{NMS setting}(a)(c). Conversely, a small distance threshold may fail to eliminate redundant predictions effectively, resulting in false positives, as shown in Fig. \ref{NMS setting}(b)(d). Therefore, achieving an optimal trade-off across all scenarios by manually setting the distance threshold is challenging. %The root of this problem lies in the fact that the distance definition in NMS considers only geometric parameters while ignoring the semantic context in the image. As a result, when two predictions are ``close'' to each other, it is nearly impossible to determine whether one of them is redundant.% where lane ground truths are closer together than in sparse scenarios;including those mentioned above,
\par
To address the above two issues, we propose Polar R-CNN, a novel anchor-based method for lane detection. For the first issue, we introduce local and global heads based on the polar coordinate system to create anchors with more accurate locations, thereby reducing the number of proposed anchors in sparse scenarios, as illustrated in Fig. \ref{anchor setting}(c). In contrast to \textit{State-Of-The-Art} (SOTA) methods \cite{clrnet}\cite{clrernet}, which utilize 192 anchors, Polar R-CNN employs only 20 anchors to effectively cover potential lane ground truths. For the second issue, we have revised Fast NMS to Graph-based Fast NMS, incorporating a new heuristic \textit{Graph Neural Network} (GNN) block (Polar GNN block) into the NMS head. The Polar GNN block offers an interpretable structure, achieving nearly equivalent performance in sparse scenarios and superior performance in dense scenarios. We conducted experiments on five major benchmarks: \textit{TuSimple} \cite{tusimple}, \textit{CULane} \cite{scnn}, \textit{LLAMAS} \cite{llamas}, \textit{CurveLanes} \cite{curvelanes}, and \textit{DL-Rail} \cite{dalnet}. Our proposed method demonstrates competitive performance compared to SOTA approaches. Our main contributions are summarized as follows:
\begin{itemize}
\item We design a strategy to simplify the anchor parameters by using local and global polar coordinate systems and apply it to two-stage lane detection frameworks. Compared to other anchor-based methods, this strategy significantly reduces the number of proposed anchors while achieving better performance.
\item We propose a novel Polar GNN block to implement an NMS-free paradigm. The block is inspired by Graph-based Fast NMS, providing enhanced interpretability. Our Polar GNN block supports end-to-end training and testing while still allowing for traditional NMS post-processing as an option for an NMS version of our model.
\item By integrating the polar coordinate systems and Polar GNN block, we present a Polar R-CNN model for fast and efficient lane detection. We also conduct extensive experiments on five benchmark datasets to demonstrate that our model achieves high performance with fewer anchors and an NMS-free paradigm. %Additionally, our model features a straightforward structure—lacking cascade refinement or attention strategies—making it simpler to deploy.
\end{itemize}
%
\begin{figure*}[ht]
\centering
\includegraphics[width=\linewidth]{thesis_figure/ovarall_architecture.png} % replace with your image file name
\caption{The overall pipeline of Polar R-CNN. The architecture is simple and lightweight. The local polar head proposes sparse line anchors, and after pooling features along these anchors, the global polar head produces the final predictions. The global polar head includes a triplet head, which comprises a one-to-one classification head (O2O cls head), a one-to-many classification head (O2M cls head), and a one-to-many regression head (O2M reg head). The O2O classification head replaces NMS post-processing by selecting a single positive prediction sample for each ground truth from the redundant predictions generated by the O2M head.}
\label{overall_architecture}
\centering
\includegraphics[width=0.85\linewidth]{thesis_figure/ovarall_architecture.png}
\caption{An illustration of the Polar R-CNN architecture. It has a pipeline similar to that of Faster R-CNN for object detection, and consists of a backbone, an FPN with three levels of feature maps, denoted by $P_0, P_1, P_2$, respectively, followed by a \textit{local polar head} and an RoI pooling module that extracts features fed to a \textit{global polar head} for lane detection. Based on the designed lane representation and lane anchor representation in the polar coordinate system, the local polar head can propose sparse line anchors and the global polar head can produce robust and accurate lane predictions. The global polar head includes a triplet head, which comprises a \textit{one-to-one classification} (O2O Cls) head, a \textit{one-to-many classification} (O2M Cls) head, and a \textit{one-to-many regression} (O2M Reg) head.}
\label{overall_architecture}
\end{figure*}
\textbf{NMS-Free Object Detection.} Non-maximum suppression (NMS) is an important post-processing step in most general object detection methods. DETR \cite{detr} employs one-to-one label assignment to avoid redundant predictions without using NMS. Other NMS-free methods \cite{learnNMS} have also been proposed, addressing this issue from two aspects: model architecture and label assignment. Studies \cite{date} \cite{yolov10} suggest that one-to-one assignments are crucial for NMS-free predictions, but maintaining one-to-many assignments is still necessary to ensure effective feature learning of the model. Other works \cite{o3d} \cite{relationnet} consider the model's expressive capacity to provide non-redundant predictions. However, few studies have analyzed the NMS-free paradigm for anchor-based lane detection methods as thoroughly as in general object detection. Most anchor-based lane detection methods still rely on NMS post-processing. In our work, besides label assignment, we extend the analysis to the detection head's structure, focusing on achieving non-redundant (NMS-free) lane predictions.
In this work, we aim to address the two issues in anchor-based lane detection mentioned above: the sparse lane anchor setting and NMS-free prediction.
\section{Proposed method}
The overall architecture of Polar R-CNN is illustrated in Fig. \ref{overall_architecture}. Our model adheres to the Faster R-CNN \cite{fasterrcnn} framework, consisting of a backbone, Feature Pyramid Network (FPN), Region Proposal Network (RPN), and Region of Interest (RoI) pooling. To investigate the fundamental factors affecting model performance, such as anchor settings and NMS post-processing, and make the model easier to deploy, Polar R-CNN employs a simple and straightforward network structure. It relies on basic components including convolutional layers, Multi-Layer Perceptrons (MLPs), and pooling operations, deliberately excluding advanced elements like attention mechanisms, dynamic kernels, and cross-layer refinement used in previous works \cite{clrnet}\cite{clrernet}.
% \begin{table}[h]
% \centering
% \caption{Notations of some important variable}
% \begin{adjustbox}{width=\linewidth}
% \begin{tabular}{lll}
% \toprule
% \textbf{Variable} & \textbf{Type} & \hspace{10em}\textbf{Definition} \\
% \midrule
% $\mathbf{P}_{i}$ & tensor& The $i_{th}$ output feature map from FPN\\
% $H^{l}$& scalar& The height of the local polar map\\
% $W^{l}$& scalar& The width of the local polar map\\
% $K_{a}$ & scalar& The number of anchors selected during evaluation\\
% $\mathbf{c}^{g}$& tensor& The origin point of global polar coordinate\\
% $\mathbf{c}^{l}$& tensor& The origin point of local polar coordinate\\
% $r^{g}_{i}$& scalar& The $i_{th}$ anchor radius under global polar coordinate\\
% $r^{l}_{i}$& scalar& The $i_{th}$ anchor radius under local polar coordinate\\
% $\theta_{i}$& scalar& The $i_{th}$ anchor angle under global/local polar coordinate\\
% \midrule
% $\mathbf{X}^{pool}_{i}$& tensor& The pooling feature of the $i_{th}$ anchor\\
% $N^{nbr}_{i}$& set& The adjacent node set of the $i_{th}$ of anchor node\\
% $C_{o2m}$ & scalar& The positive threshold of one-to-many confidence\\
% $C_{o2o}$ & scalar& The positive threshold of one-to-one confidence\\
% $d_{dim}$ & scalar& Dimension of the distance tensor.\\
% $w_{b}$ & scalar& Base width of the lane instance.\\
% % \midrule
% % & & \\
% % & & \\
% % & & \\
% % & & \\
% % & & \\
% \bottomrule
% \end{tabular}
% \end{adjustbox}
% \end{table}
\subsection{Lane and Lane Anchor Representation}
Lanes are characterized by their thin and elongated curved shapes. A suitable lane prior aids the model in extracting features, predicting locations, and modeling the shapes of lane curves with greater accuracy. Consistent with previous studies \cite{linecnn}\cite{laneatt}, our lane priors (also referred to as lane anchors) consist of straight lines. We sample a sequence of 2D points along each lane anchor, denoted as $ P\doteq \left\{ \left( x_1, y_1 \right) , \left( x_2, y_2 \right) , \ldots,\left( x_N, y_N \right) \right\} $, where $N$ is the number of sampled points. The y-coordinates of these points are uniformly sampled from the vertical axis of the image, specifically $y_i=\frac{H}{N-1}\cdot i$, where $H$ is the image height. These y-coordinates are also sampled from the ground truth lane, and the model is tasked with regressing the x-coordinate offset from the lane anchor to the lane instance ground truth. The primary distinction between Polar R-CNN and previous approaches lies in the description of the lane anchors, which will be detailed in the following sections.
%
\section{Related Works}
%As mentioned above, our model is based on deep learning.
Generally, deep learning-based lane detection methods can be categorized into three groups: segmentation-based, parameter-based, and anchor-based methods. Additionally, NMS-free is an important technique for anchor-based methods, and it will also be described in this section.
\par
\textbf{Segmentation-based Methods.} These methods focus on pixel-wise prediction. They classify each pixel into different categories according to lane instances and background \cite{lanenet} and predict information pixel by pixel. However, they often overly emphasize low-level and local features, neglecting global semantic information and real-time detection. To address this issue, \textit{SCNN} \cite{scnn} uses a larger receptive field. Some methods, such as \textit{UFLDv1-v2} \cite{ufld}\cite{ufldv2} and \textit{CondLaneNet}\cite{CondLaneNet}, utilize row-wise or column-wise classification instead of pixel classification to improve detection speed. Another issue with these methods is that the lane instance prior is learned by the model itself, leading to a lack of prior knowledge. For example, \textit{LaneNet}\cite{lanenet} uses post-clustering to distinguish each lane instance, while \textit{UFLDv1-v2}
categorizes lane instances by angles and locations, allowing it to detect only a fixed number of lanes. In contrast, \textit{CondLaneNet} employs different conditional dynamic kernels to predict different lane instances. Additionally, some methods such as \textit{FOLOLane}\cite{fololane} and \textit{GANet}\cite{ganet} adopt bottom-up strategies to detect a few key points and model their global relations to form lane instances.
\par
\textbf{Parameter-based Methods.} Instead of predicting a series of point locations or pixel classifications, the parameter-based methods directly generate the curve parameters of lane instances. For example, \textit{PolyLaneNet}\cite{polylanenet} and \textit{LSTR}\cite{lstr} consider the lane instance as a polynomial curve, outputting the polynomial coefficients directly. \textit{BézierLaneNet}\cite{bezierlanenet} treats the lane instance as a Bézier curve, generating the locations of their control points, while \textit{BSLane}\cite{bsnet} uses B-Spline to describe the lane, with curve parameters that emphasize local lane shapes. These parameter-based methods are mostly end-to-end and do not require post-processing, resulting in faster inference speed. However, since the final visual lane shapes are sensitive to the curve parameters, the robustness and generalization of these methods may not be optimal.
\par
\textbf{Anchor-Based Methods.} These methods are inspired by general object detection models, such as YOLO \cite{yolov10} and Faster R-CNN \cite{fasterrcnn}, for lane detection. The earliest work is Line-CNN, which utilizes line anchors designed as rays emitted from the three edges (left, bottom, and right) of an image. However, the model's receptive field is limited to the edges, rendering it suboptimal for capturing the entirety of the lane. LaneATT \cite{laneatt} improves upon this by employing anchor-based feature pooling to aggregate features along the entire line anchor, achieving faster speeds and better performance. Nevertheless, its grid sampling strategy and label assignment still pose limitations. A key advantage of the anchor-based methods is their flexibility, allowing the integration of strategies from anchor-based object detection. For example, \textit{CLRNet} \cite{clrnet} enhances the performance with \textit{cross-layer refinement strategies}, \textit{SimOTA label assignment} \cite{yolox}, and \textit{LIOU loss}, outperforming many previous methods. They also have some essential drawbacks, e.g., lane anchors are often handcrafted and numerous. Some approaches, such as \textit{ADNet} \cite{adnet}, \textit{SRLane} \cite{srlane}, and \textit{Sparse Laneformer} \cite{sparse}, attempt to reduce the number of anchors and provide more flexible proposals; however, this can slightly impact performance. Additionally, methods such as \cite{clrernet} \cite{adnet} still rely on NMS post-processing, complicating NMS threshold settings and model deployment. Although one-to-one label assignment during training, without NMS \cite{detr}\cite{o2o} during evaluation, alleviates this issue, its performance is still less satisfactory compared to NMS-based models.
\par
\textbf{NMS-free Methods.} Due to the threshold sensitivity and computational overhead of NMS, many studies explore NMS-free methods, i.e., models that do not use NMS during the detection process. For example, \textit{DETR} \cite{detr} employs one-to-one label assignment to avoid redundant predictions without using NMS. Other NMS-free methods \cite{learnNMS}\cite{date} \cite{yolov10} have also been proposed to address this issue from two aspects: \textit{model architecture} and \textit{label assignment}. For example, studies in \cite{date} \cite{yolov10} suggest that one-to-one assignments are crucial for NMS-free predictions, but maintaining one-to-many assignments is still necessary to ensure effective feature learning of the model. Meanwhile, some works \cite{o3d} \cite{relationnet} consider the model's expressive capacity to provide non-redundant predictions. However, compared to the extensive studies conducted in general object detection, there has been limited research analyzing the NMS-free paradigm for lane detection.
\par
In this work, we aim to address the above two issues in the framework of anchor-based detection to achieve NMS-free and non-redundant lane predictions.
%
\begin{figure}[t]
\centering
\def\subwidth{0.24\textwidth}
\def\imgwidth{\linewidth}
\def\imgheight{0.4\linewidth}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth]{thesis_figure/coord/ray.png}
\caption{}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth]{thesis_figure/coord/polar.png}
\caption{}
\end{subfigure}
\caption{Different descriptions for anchor parameters: (a) Ray: defined by its start point and orientation. (b) Polar: defined by its radius and angle.}
\label{coord}
\centering
\def\subwidth{0.24\textwidth}
\def\imgwidth{\linewidth}
\def\imgheight{0.4\linewidth}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth]{thesis_figure/coord/ray.png}
\caption{}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth]{thesis_figure/coord/polar.png}
\caption{}
\end{subfigure}
\caption{Different descriptions for anchor parameters: (a) Ray: defined by its starting point and direction $\theta$. (b) Polar: defined by its radius and angle.} %rectangular coordinates
\label{coord}
\end{figure}
%
\section{Polar R-CNN}
The overall architecture of our Polar R-CNN is illustrated in Fig. \ref{overall_architecture}. As shown in this figure, our Polar R-CNN for lane detection has a similar pipeline to Faster R-CNN \cite{fasterrcnn}, which consists of a backbone, a \textit{Feature Pyramid Network} (FPN), a \textit{Region Proposal Network} (RPN) followed by a local polar head, and a \textit{Region of Interest} (RoI) pooling module followed by a global polar head. To investigate the fundamental factors affecting model performance, such as anchor settings and NMS post-processing, and also to enhance ease of deployment, our Polar R-CNN utilizes a simple and straightforward network structure, relying only on basic components, including convolutional and pooling operations and \textit{Multi-Layer Perceptrons} (MLPs), while deliberately excluding advanced elements like \textit{attention mechanisms}, \textit{dynamic kernels}, and \textit{cross-layer refinement} used in previous works \cite{clrnet}\cite{clrernet}.
\par
In the following, based on a polar coordinate representation of lane and lane anchors, we will further introduce the designed \textit{Local Polar Head} (LPH) and \textit{Global Polar Head} (GPH) in our Polar R-CNN.
%
\subsection{Representation of Lane and Lane Anchor}
%
Lanes are characterized by their thin, elongated, and curved shapes. A well-defined lane prior aids the model in feature extraction and location prediction.
\par
\textbf{Lane and Anchor Representation as Ray.} Given an input image with dimensions of width $W$ and height $H$, a lane is represented by a set of 2D points with equally spaced y-coordinates $Y=\{y_1, y_2,\cdots, y_n\}$, where $y_i=i\times\frac{H}{n}$ and $n$ is the number of data points. Since the set $Y$ is fixed, a lane can be uniquely defined by its x-coordinates $X=\{x_1,x_2,\cdots,x_n\}$, with each $x_i$ corresponding to the respective $y_i\in Y$. Previous studies \cite{linecnn}\cite{laneatt} have introduced lane priors, also known as lane anchors, which are represented as straight lines in the image plane that serve as references. From a geometric perspective, a lane anchor can be viewed as a ray defined by a starting point $(x_{orig},y_{orig})$ located at the edge of an image (left/bottom/right boundaries), along with a direction $\theta$, as shown in Fig. \ref{coord}(a). The primary task of a lane detection model is to estimate the x-coordinate offset from the lane anchor to the ground truth of the lane instance. However, ......
\textbf{Polar Coordinate System.} Since lane anchors are typically represented as straight lines, they can be described using straight line parameters. Previous approaches have used rays to describe 2D lane anchors, with the parameters including the coordinates of the starting point and the orientation/angle, denoted as $\left\{\theta, P_{xy}\right\}$, as shown in Fig. \ref{coord}(a). \cite{linecnn}\cite{laneatt} define the start points as lying on the three image boundaries. However, \cite{adnet} argue that this approach is problematic because the actual starting point of a lane could be located anywhere within the image. In our analysis, using a ray can lead to ambiguity in line representation because a line can have an infinite number of starting points, and the choice of the starting point for a lane is subjective. As illustrated in Fig. \ref{coord}(a), the yellow (the visual start point) and green (the point located on the image boundary) starting points with the same orientation $\theta$ describe the same line, and either could be used in different datasets \cite{scnn}\cite{vil100}. This ambiguity arises because a straight line has two degrees of freedom, whereas a ray has three (two for the start point and one for orientation). To resolve this issue, we propose using polar coordinates to describe a lane anchor with only two parameters: radius and angle, denoted as $\left\{\theta, r\right\}$, where $\theta \in \left[-\frac{\pi}{2}, \frac{\pi}{2}\right)$ and $r \in \left(-\infty, +\infty\right)$. This representation is illustrated in Fig. \ref{coord}(b).
\newpage
\par
\textbf{Representation in Polar Coordinate.}
As stated above, lane anchors represented by rays have some drawbacks. To address these issues, we introduce the polar coordinate representation of lane anchors. In mathematics, the polar coordinate system is a two-dimensional coordinate system in which each point on a plane is determined by a distance from a reference point (also called the pole) and an angle $\theta$ from a reference direction (called the polar axis). As shown in Fig. \ref{coord}(b), a lane anchor for a straight line can be uniquely defined by two parameters: the radial distance from the pole (called radius), $r$, and the counterclockwise angle from the polar axis, $\theta$, with $r\geq 0$ and $\theta\in\left(-\frac{\pi}{2}, \frac{\pi}{2}\right]$.
\newpage
Since lane anchors are typically represented as straight lines, they can be described using straight line parameters. Previous approaches have used rays to describe 2D lane anchors, with the parameters including the coordinates of the starting point and the orientation/angle, denoted as $\left\{\theta, P_{xy}\right\}$, as shown in Fig. \ref{coord}(a). \cite{linecnn}\cite{laneatt} define the start points as lying on the three image boundaries. However, \cite{adnet} argue that this approach is problematic because the actual starting point of a lane could be located anywhere within the image. In our analysis, using a ray can lead to ambiguity in line representation because a line can have an infinite number of starting points, and the choice of the starting point for a lane is subjective. As illustrated in Fig. \ref{coord}(a), the yellow (the visual start point) and green (the point located on the image boundary) starting points with the same orientation $\theta$ describe the same line, and either could be used in different datasets \cite{scnn}\cite{vil100}. This ambiguity arises because a straight line has two degrees of freedom, whereas a ray has three (two for the start point and one for orientation). To resolve this issue, we propose using polar coordinates to describe a lane anchor with only two parameters: radius and angle, denoted as $\left\{\theta, r\right\}$. This representation is illustrated in Fig. \ref{coord}(b).
\newpage
\begin{figure}[t]
\centering
\includegraphics[width=0.45\textwidth]{thesis_figure/local_polar_head.png}
@ -318,7 +284,7 @@ where $\boldsymbol{w}_{L}^{s}\in \mathbb{R} ^{N_p}$ represents the learnable agg
\\
&f_{cls}^{plain}\left( \boldsymbol{F}_{i}^{roi} \right) \rightarrow 1,
\\
&f_{cls}^{plain}\left( \boldsymbol{F}_{i}^{roi} \right) \rightarrow 0.
&f_{cls}^{plain}\left( \boldsymbol{F}_{j}^{roi} \right) \rightarrow 0.
\end{aligned}
\label{sharp fun}
\end{equation}
@ -1228,7 +1194,7 @@ In the traditional NMS post-processing \cite{clrernet}, the default IoU threshol
\end{subfigure}
\vspace{0.5em}
\caption{The visualization of the detection results of sparse scenarios.}
    \caption{The visualization of the detection results of sparse and dense scenarios on the CurveLanes dataset.}
\label{vis_dense}
\end{figure*}

View File

@ -441,3 +441,50 @@
pages={658--666},
year={2019}
}
@article{sobel,
title={Lane detection based on object segmentation and piecewise fitting},
author={Mu, Chunyang and Ma, Xing},
journal={TELKOMNIKA Indonesian Journal of Electrical Engineering},
volume={12},
number={5},
pages={3491--3500},
year={2014}
}
@article{mu2014lane,
title={Lane detection based on object segmentation and piecewise fitting},
author={Mu, Chunyang and Ma, Xing},
journal={TELKOMNIKA Indonesian Journal of Electrical Engineering},
volume={12},
number={5},
pages={3491--3500},
year={2014}
}
@article{deephough,
title={Deep hough transform for semantic line detection},
author={Zhao, Kai and Han, Qi and Zhang, Chang-Bin and Xu, Jun and Cheng, Ming-Ming},
journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
volume={44},
number={9},
pages={4793--4806},
year={2021},
publisher={IEEE}
}
@article{deformabledetr,
title={Deformable detr: Deformable transformers for end-to-end object detection},
author={Zhu, Xizhou and Su, Weijie and Lu, Lewei and Li, Bin and Wang, Xiaogang and Dai, Jifeng},
journal={arXiv preprint arXiv:2010.04159},
year={2020}
}
@inproceedings{dualassign,
title={A dual weighting label assignment scheme for object detection},
author={Li, Shuai and He, Chenhang and Li, Ruihuang and Zhang, Lei},
booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
pages={9387--9396},
year={2022}
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 694 KiB

After

Width:  |  Height:  |  Size: 1.3 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 MiB

After

Width:  |  Height:  |  Size: 1.6 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 MiB

After

Width:  |  Height:  |  Size: 1.6 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 621 KiB

After

Width:  |  Height:  |  Size: 624 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 MiB

After

Width:  |  Height:  |  Size: 1.4 MiB

Binary file not shown.

Binary file not shown.