update
This commit is contained in:
parent 1043faa952
commit 77575e82e3
main.tex (218 lines changed)
@ -29,7 +29,7 @@

\begin{document}

\title{Polar R-CNN:\@ End-to-End Lane Detection with Fewer Anchors}

\author{Shengqi Wang and Junmin Liu\\

@ -43,8 +43,8 @@

% The paper headers
\markboth{S. Wang \MakeLowercase{\textit{et al.}}: Polar R-CNN:\@ End-to-End Lane Detection with Fewer Anchors}%
{S. Wang \MakeLowercase{\textit{et al.}}: Polar R-CNN:\@ End-to-End Lane Detection with Fewer Anchors}

% \IEEEpubid{0000--0000/00\$00.00~\copyright~2021 IEEE}
% Remember, if you use this you must call \IEEEpubidadjcol in the second
@ -53,7 +53,7 @@
\maketitle

\begin{abstract}
Lane detection is a critical and challenging task in autonomous driving, particularly in real-world scenarios where traffic lanes are often slender, lengthy, and partially obscured by other vehicles, complicating detection efforts. Existing anchor-based methods typically rely on prior lane anchors to extract features and to refine lane location and shape. Although these methods achieve high performance, manually setting prior anchors is cumbersome, and ensuring adequate coverage across diverse datasets often requires a large number of dense anchors. Additionally, non-maximum suppression (NMS) is used to remove redundant predictions, which complicates real-world deployment and may fail in dense scenarios. In this study, we introduce Polar R-CNN, an NMS-free anchor-based method for lane detection. By incorporating both local and global polar coordinate systems, Polar R-CNN enables flexible anchor proposals and significantly reduces the number of anchors required without compromising performance. Additionally, we introduce a heuristic GNN-based NMS-free head that supports an end-to-end paradigm, making the model more deployment-friendly and enhancing performance in dense scenarios. Our method achieves competitive results on five popular lane detection benchmarks (TuSimple, CULane, LLAMAS, CurveLanes, and DL-Rail) while maintaining a lightweight design and a straightforward structure. Our source code is available at \href{https://github.com/ShqWW/PolarRCNN}{\textit{https://github.com/ShqWW/PolarRCNN}}.
\end{abstract}
\begin{IEEEkeywords}
Lane detection, NMS-free, Graph neural network, Polar coordinate system.
@ -62,9 +62,9 @@ Lane detection, NMS-free, Graph neural network, Polar coordinate system.
\section{Introduction}
\IEEEPARstart{L}{ane} detection is a significant problem in computer vision and autonomous driving, forming the basis for accurately perceiving the driving environment in intelligent driving systems. While extensive research has been conducted in ideal environments, lane detection remains challenging in adverse scenarios such as night driving, glare, crowded roads, and rainy conditions, where lanes may be occluded or damaged. Moreover, the slender shapes, complex topologies, and global extent of lanes add to the difficulty of detection. An effective lane detection method should take into account both global high-level semantic features and local low-level features to address these varied conditions and ensure robust performance in real-time applications such as autonomous driving.

Traditional methods predominantly concentrate on handcrafted local feature extraction and lane shape modeling. Techniques such as the Canny edge detector\cite{cannyedge}, Hough transform\cite{houghtransform}, and deformable templates for lane fitting\cite{kluge1995deformable} have been extensively utilized. Nevertheless, these approaches often encounter limitations in practical settings, particularly when low-level and local features lack clarity and distinctiveness.

In recent years, fueled by advancements in deep learning and the availability of large datasets, significant strides have been made in lane detection. Deep models, including convolutional neural networks (CNNs) and transformer-based architectures, have propelled progress in this domain. Previous approaches often treated lane detection as a segmentation task, which, despite its simplicity, involves time-consuming computations. Some methods rely on parameter-based models, directly outputting lane curve parameters instead of pixel locations. These models offer end-to-end solutions, but the sensitivity of curve parameters to lane shape compromises their robustness.

\begin{figure}[t]
\centering
@ -89,7 +89,7 @@ In recent years, fueled by advancements in deep learning and the availability of
\includegraphics[width=\imgwidth, height=\imgheight]{thsis_figure/anchor_demo/gt.jpg}
\caption{}
\end{subfigure}
\caption{Anchor settings of different methods. (a) The initial anchor settings of CLRNet. (b) The learned anchor settings of CLRNet trained on CULane. (c) The proposed anchors of our method. (d) The ground truth.}
\label{anchor setting}
\end{figure}

@ -118,59 +118,59 @@ In recent years, fueled by advancements in deep learning and the availability of
\caption{}
\end{subfigure}

\caption{Comparison of different NMS thresholds in sparse and dense scenarios. (a) Ground truth in a dense scenario. (b) Predictions with a large NMS threshold in a dense scenario. (c) Ground truth in a sparse scenario. (d) Predictions with a small NMS threshold in a sparse scenario.}
\label{NMS setting}
\end{figure}


Drawing inspiration from object detection methods such as YOLO \cite{yolov10} and Faster R-CNN \cite{fasterrcnn}, several anchor-based approaches have been introduced for lane detection, with representative works including LaneATT \cite{laneatt} and CLRNet \cite{clrnet}. These methods have demonstrated superior performance by leveraging anchor priors and enabling larger receptive fields for feature extraction. However, anchor-based lane detection methods encounter drawbacks similar to those of anchor-based general object detection, as follows:

(1) A large number of lane anchors are placed across the image, even in sparse scenarios.

(2) Non-maximum suppression (NMS) post-processing is necessary to remove redundant predictions but may fail in dense scenarios.

Regarding the first issue, \cite{clrnet} introduced learned anchors, where the anchor parameters are optimized during training to adapt to the lane distributions in real datasets (see Fig. \ref{anchor setting} (b)). Additionally, they employ cascade cross-layer anchor refinement to bring the anchors closer to the ground truth. However, numerous anchors are still required to cover the potential distribution of lanes. Going further, \cite{adnet} proposes flexible anchors for each image by generating start points, rather than using a fixed set of anchors for all images. Nevertheless, the start points of lanes are subjective and lack clear visual evidence due to the global nature of lanes, which affects performance. \cite{srlane} uses a local angle map to propose sketch anchors according to the direction of the ground truth. This approach only considers direction and neglects the accurate positioning of anchors, resulting in suboptimal performance without cascade anchor refinement. Overall, numerous anchors are unnecessary in sparse scenarios (where lane ground truths are sparse). The trend in newly proposed methods is to reduce the number of anchors and offer more flexible anchor configurations.

Regarding the second issue, nearly all anchor-based methods (including those mentioned above) require direct or indirect NMS post-processing to eliminate redundant predictions. Although eliminating redundant predictions is necessary, NMS remains a suboptimal solution. On the one hand, NMS is not deployment-friendly because it involves defining and calculating distances (e.g., intersection over union) between lane pairs, which is more challenging than for bounding boxes in general object detection due to the complexity of lane geometry. On the other hand, NMS fails in some dense scenarios where the lane ground truths are closer together than in sparse scenarios. A large distance threshold may result in false negatives, as some true positive predictions might be eliminated by mistake (as shown in Fig. \ref{NMS setting} (a) and (b)). Conversely, a small distance threshold may not eliminate redundant predictions effectively and can leave false positives (as shown in Fig. \ref{NMS setting} (c) and (d)). Achieving an optimal trade-off in all scenarios by manually setting the distance threshold is challenging. The root cause of this problem is that the distance definition in NMS considers only geometric parameters while ignoring the semantic context in the image. Thus, when two predictions are ``close'' to each other, it is nearly impossible to determine whether one of them is redundant.

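For illustration only, the following minimal sketch shows a threshold-based lane NMS of the kind discussed above; it is an assumption for exposition rather than the implementation of any cited method, and all names are hypothetical.
\begin{verbatim}
# A minimal sketch of lane NMS with a fixed distance threshold. Each lane is
# a vector of x-coordinates sampled at shared image rows; the pairwise
# distance is the mean horizontal gap between two lanes.
import numpy as np

def lane_nms(xs, scores, dist_thresh):
    # xs: (num_lanes, num_points) sampled x-coordinates; scores: (num_lanes,)
    order = np.argsort(-scores)              # highest confidence first
    keep = []
    for i in order:
        if all(np.abs(xs[i] - xs[j]).mean() >= dist_thresh for j in keep):
            keep.append(i)                   # not redundant w.r.t. kept lanes
    return keep

# A small threshold keeps near-duplicates (false positives in sparse scenes);
# a large threshold may suppress genuinely close lanes (dense scenes).
\end{verbatim}
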
To address the two issues outlined above, we propose Polar R-CNN, a novel anchor-based method for lane detection. For the first issue, we introduce local and global heads based on the polar coordinate system to create anchors with more accurate locations and to reduce the number of proposed anchors in sparse scenarios, as illustrated in Fig. \ref{anchor setting} (c). Compared to state-of-the-art previous works \cite{clrnet}\cite{clrernet}, which use 192 anchors, Polar R-CNN employs only 20 anchors to cover potential lane ground truths. For the second issue, we revise Fast NMS into Graph-based Fast NMS and introduce a new heuristic graph neural network block (Polar GNN block) integrated into the NMS-free head. The Polar GNN block offers an interpretable structure, achieving nearly equivalent performance in sparse scenarios and superior performance in dense scenarios. We conducted experiments on five major benchmarks: TuSimple \cite{tusimple}, CULane \cite{scnn}, LLAMAS \cite{llamas}, CurveLanes \cite{curvelanes}, and DL-Rail \cite{dalnet}. Our proposed method demonstrates competitive performance compared to state-of-the-art methods.

Our main contributions are summarized as follows:

\begin{itemize}
\item We simplify the anchor parameters using local and global polar coordinate systems and apply them to a two-stage lane detection framework. Compared to other anchor-based methods, the number of proposed anchors is greatly reduced while achieving better performance.
\item We introduce a novel heuristic Polar GNN block to implement an NMS-free paradigm. The GNN architecture is designed with reference to Graph-based Fast NMS, providing interpretability. Our model supports end-to-end training and testing, but traditional NMS post-processing can still be used as an option to obtain an NMS-based version of our model.
\item Our method utilizes a two-stage architecture and achieves competitive performance compared to state-of-the-art methods across five datasets. The high performance with fewer anchors and an NMS-free paradigm demonstrates the effectiveness of our approach. Additionally, our model is designed with a straightforward structure (without cascade refinement or attention strategies), which simplifies deployment.
\end{itemize}

\section{Related Works}
Lane detection aims to detect lane instances in an image. In this section, we introduce only deep-learning-based methods, which can be categorized into segmentation-based, parameter-based, and anchor-based methods.

\textbf{Segmentation-based Methods.} Segmentation-based methods focus on pixel-wise prediction. They classify each pixel into different categories according to lane instances and background\cite{lanenet} and predict information pixel by pixel. However, these methods overly focus on low-level and local features, neglecting global semantic information and real-time performance. SCNN uses a larger receptive field to overcome this problem. Some methods, such as UFLDv1 and v2\cite{ufld}\cite{ufldv2} and CondLaneNet\cite{CondLaneNet}, utilize row-wise or column-wise classification instead of pixel classification to improve detection speed. Another issue with these methods is that the lane instance prior is learned by the model itself, leading to a lack of prior knowledge. LaneNet uses post-clustering to distinguish each lane instance. UFLD divides lane instances by angles and locations and can only detect a fixed number of lanes. CondLaneNet utilizes different conditional dynamic kernels to predict different lane instances. Some methods, such as FOLOLane\cite{fololane} and GANet\cite{ganet}, use bottom-up strategies to detect a few key points and model their global relations to form lane instances.

\textbf{Parameter-based Methods.} Instead of predicting a series of point locations or pixel classes, parameter-based methods directly generate the curve parameters of lane instances. PolyLaneNet\cite{polylanenet} and LSTR\cite{lstr} consider the lane instance as a polynomial curve and output the polynomial coefficients directly. BézierLaneNet\cite{bezierlanenet} treats the lane instance as a Bézier curve and generates the locations of the control points of the curve. BSLane uses B-splines to describe lanes, with curve parameters that focus on local lane shapes. Parameter-based methods are mostly end-to-end without post-processing, which grants them faster speed. However, since the final lane shapes are sensitive to the predicted curve parameters, the robustness and generalization of parameter-based methods may be less than ideal.

\textbf{Anchor-Based Methods.} Inspired by general object detection methods like YOLO \cite{yolov10} and Faster R-CNN \cite{fasterrcnn}, anchor-based approaches have been proposed for lane detection. Line-CNN is, to our knowledge, the earliest method that utilizes line anchors for detecting lanes. These lines are designed as rays emitted from three edges (left, bottom, and right) of an image. However, the model's receptive field is limited to the edges, which makes it suboptimal for capturing the entire lane. LaneATT \cite{laneatt} improves upon this by employing anchor-based feature pooling to aggregate features along the entire line anchor, achieving faster speeds and better performance. Nevertheless, its grid sampling strategy and label assignment pose limitations. CLRNet \cite{clrnet} enhances anchor-based performance with cross-layer refinement strategies, SimOTA label assignment \cite{yolox}, and a LIoU loss, surpassing many previous methods. A key advantage of anchor-based methods is their adaptability, allowing the integration of strategies from anchor-based general object detection, such as label assignment, bounding box refinement, and GIoU loss. However, existing anchor-based lane detection methods also have notable drawbacks. Lane anchors are often handcrafted and numerous, which can be cumbersome. Some approaches, such as ADNet \cite{adnet}, SRLane \cite{srlane}, and Sparse Laneformer \cite{sparse}, attempt to reduce the number of anchors and provide flexible proposals, but this can slightly impact performance. Additionally, methods such as \cite{clrernet}\cite{adnet} still rely on NMS post-processing, complicating NMS threshold settings and model deployment. Although one-to-one label assignment (during training) without NMS \cite{detr}\cite{o2o} (during evaluation) alleviates this issue, its performance remains less satisfactory compared to NMS-based models.

\begin{figure*}[ht]
\centering
\includegraphics[width=\linewidth]{thsis_figure/ovarall_architecture.png} % replace with your image file name
\caption{The overall pipeline of Polar R-CNN. The architecture is simple and lightweight. The backbone (e.g., ResNet18) and FPN extract features from the image, and the local polar head proposes sparse line anchors. After pooling the features sampled along the line anchors, the global polar head gives the final predictions. A triplet head is set in the global polar head, including a one-to-one classification head (O2O cls head), a one-to-many classification head (O2M cls head), and a one-to-many regression head (O2M reg head). The O2O cls head aims to replace NMS post-processing by selecting only one positive prediction for each ground truth from the redundant predictions of the O2M cls head.}
\label{overall_architecture}
\end{figure*}

\textbf{NMS-Free Object Detection.} Non-maximum suppression (NMS) is an important post-processing step in most general object detection methods. DETR \cite{detr} employs one-to-one label assignment to avoid redundant predictions without using NMS. Other NMS-free methods \cite{learnNMS} have also been proposed, addressing this issue from two aspects: model architecture and label assignment. Studies \cite{date}\cite{yolov10} suggest that one-to-one assignments are crucial for NMS-free predictions, but maintaining one-to-many assignments is still necessary to ensure effective feature learning. Other works \cite{o3d}\cite{relationnet} consider the model's expressive capacity to provide non-redundant predictions. However, few studies have analyzed the NMS-free paradigm for anchor-based lane detection as thoroughly as in general object detection, and most anchor-based lane detection methods still rely on NMS post-processing. In our work, besides label assignment, we extend the analysis to the detection head's structure, focusing on achieving non-redundant (NMS-free) lane predictions.

In this work, we aim to address the two issues in anchor-based lane detection mentioned above: the sparse lane anchor setting and NMS-free predictions.

\section{Proposed Method}
The overall architecture of Polar R-CNN is illustrated in Fig. \ref{overall_architecture}. Our model adheres to the Faster R-CNN \cite{fasterrcnn} framework, consisting of a backbone, a Feature Pyramid Network (FPN), a Region Proposal Network (RPN), and Region of Interest (RoI) pooling. To investigate the fundamental factors affecting model performance, such as anchor settings and NMS post-processing, and to make the model easier to deploy, Polar R-CNN employs a simple and straightforward network structure. It relies on basic components, including convolutional layers, multi-layer perceptrons (MLPs), and pooling operations, deliberately excluding advanced elements such as attention mechanisms, dynamic kernels, and cross-layer refinement used in previous works \cite{clrnet}\cite{clrernet}.

\begin{table}[h]
\centering
@ -181,13 +181,13 @@ The overall architecture of PolarRCNN is illustrated in Fig. \ref{overall_archit
\textbf{Variable} & \textbf{Type} & \hspace{10em}\textbf{Definition} \\
\midrule
$\mathbf{P}_{i}$ & tensor& The $i$-th output feature map from the FPN\\
$H^{l}$& scalar& The height of the local polar map\\
$W^{l}$& scalar& The width of the local polar map\\
$K_{a}$ & scalar& The number of anchors selected during evaluation\\
$\mathbf{c}^{g}$& tensor& The origin point of the global polar coordinate system\\
$\mathbf{c}^{l}$& tensor& The origin point of the local polar coordinate system\\
$r^{g}_{i}$& scalar& The $i$-th anchor radius under the global polar coordinate system\\
$r^{l}_{i}$& scalar& The $i$-th anchor radius under the local polar coordinate system\\
$\theta_{i}$& scalar& The $i$-th anchor angle under the global/local polar coordinate systems\\
\midrule
$\mathbf{X}^{pool}_{i}$& tensor& The pooled feature of the $i$-th anchor\\
@ -209,9 +209,9 @@ The overall architecture of PolarRCNN is illustrated in Fig. \ref{overall_archit

\subsection{Lane and Line Anchor Representation}

Lanes are characterized by their thin and elongated curved shapes. A suitable lane prior aids the model in extracting features, predicting locations, and modeling the shapes of lane curves with greater accuracy. Consistent with previous studies \cite{linecnn}\cite{laneatt}, our lane priors (also referred to as lane anchors) consist of straight lines. We sample a sequence of 2D points along each lane anchor, denoted as $P\doteq \left\{ \left( x_1, y_1 \right), \left( x_2, y_2 \right), \ldots, \left( x_N, y_N \right) \right\}$, where $N$ is the number of sampled points. The y-coordinates of these points are uniformly sampled along the vertical axis of the image, specifically $y_i=\frac{H}{N-1}\cdot i$, where $H$ is the image height. The same y-coordinates are used to sample the ground truth lane, and the model is tasked with regressing the x-coordinate offsets from the line anchor to the lane instance ground truth. The primary distinction between Polar R-CNN and previous approaches lies in the description of the lane anchors, which will be detailed in the following sections.

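As a purely illustrative sketch (not the released implementation; the names and the slope-intercept line form are assumptions), the sampling of anchor points and regression targets can be written as:
\begin{verbatim}
# Sample N points on a straight line anchor at uniform image rows and compute
# the x-offsets to the ground-truth lane sampled at the same rows.
import numpy as np

def sample_anchor_points(x0, slope, H, N):
    ys = H / (N - 1) * np.arange(N)      # y_i = H/(N-1) * i
    xs = x0 + slope * ys                 # x of the straight anchor at each row
    return xs, ys

def regression_targets(anchor_xs, gt_xs):
    return gt_xs - anchor_xs             # x-offsets the model regresses
\end{verbatim}
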
\textbf{Polar Coordinate System.} Since lane anchors are typically represented as straight lines, they can be described using straight-line parameters. Previous approaches have used rays to describe 2D lane anchors, with parameters comprising the coordinates of a starting point and an orientation/angle, denoted as $\left\{\theta, P_{xy}\right\}$, as shown in Fig. \ref{coord} (a). \cite{linecnn}\cite{laneatt} define the start points as lying on the three image boundaries. However, \cite{adnet} argues that this approach is problematic because the actual starting point of a lane could be located anywhere within the image. In our analysis, using a ray can lead to ambiguity in line representation because a line has an infinite number of possible starting points, and the choice of the starting point for a lane is subjective. As illustrated in Fig. \ref{coord} (a), the yellow (the visual start point) and green (the point located on the image boundary) starting points with the same orientation $\theta$ describe the same line, and either could be used in different datasets \cite{scnn}\cite{vil100}. This ambiguity arises because a straight line has two degrees of freedom, whereas a ray has three (two for the start point and one for the orientation). To resolve this issue, we propose using polar coordinates to describe a lane anchor with only two parameters, radius and angle, denoted as $\left\{\theta, r\right\}$, where $\theta \in \left[-\frac{\pi}{2}, \frac{\pi}{2}\right)$ and $r \in \left(-\infty, +\infty\right)$. This representation is illustrated in Fig. \ref{coord} (b).

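To make the two-parameter description concrete, the sketch below (an illustrative assumption) uses the line form implied by the sampling equation given later, $x\cos\theta + y\sin\theta = r$: any point on the line yields the same radius, so the start-point ambiguity of rays disappears.
\begin{verbatim}
# Radius of a line w.r.t. an origin, given its angle and any point on it.
import numpy as np

def radius_from_point(px, py, theta, cx=0.0, cy=0.0):
    # project the point (relative to the origin) onto the unit normal
    return (px - cx) * np.cos(theta) + (py - cy) * np.sin(theta)

theta = np.deg2rad(30.0)
p1 = (100.0, 200.0)
p2 = (p1[0] - 50.0 * np.sin(theta), p1[1] + 50.0 * np.cos(theta))  # same line
assert np.isclose(radius_from_point(*p1, theta), radius_from_point(*p2, theta))
\end{verbatim}
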
\begin{figure}[t]
\centering
@ -231,7 +231,7 @@ Lanes are characterized by their thin and elongated curved shapes. A suitable la
\label{coord}
\end{figure}

We define two types of polar coordinate systems: the global coordinate system and the local coordinate system, with origin points denoted as the global origin $\boldsymbol{c}^{g}$ and the local origin $\boldsymbol{c}^{l}$, respectively. For convenience, the global origin is positioned near the static vanishing point of the entire lane image dataset, while the local origins are set at lattice points within the image. As illustrated in Fig. \ref{coord} (b), only the radius parameters are affected by the choice of the origin point, while the angle/orientation parameters remain consistent.

\subsection{Local Polar Head}

@ -239,14 +239,14 @@ We define two types of polar coordinate systems: the global coordinate system an

\begin{equation}
\begin{aligned}
&F_d\gets DS\left( P_{3} \right), \,F_d\in \mathbb{R} ^{C_f\times H^{l}\times W^{l}}\\
&F_{reg\,\,}\gets \phi _{reg}^{lph}\left( F_d \right), \,F_{reg\,\,}\in \mathbb{R} ^{2\times H^{l}\times W^{l}}\\
&F_{cls}\gets \phi _{cls}^{lph}\left( F_d \right), \,F_{cls}\in \mathbb{R} ^{H^{l}\times W^{l}}
\end{aligned}
\label{lph equ}
\end{equation}

The regression branch aims to propose lane anchors by predicting two parameters, $F_{reg} \equiv \left\{\theta_{j}, r^{l}_{j}\right\}_{j=1}^{H^{l}\times W^{l}}$, within the local polar coordinate system; these parameters represent the angle and the radius. The classification branch predicts the heat map $F_{cls} \equiv \left\{c_{j}\right\}_{j=1}^{H^{l}\times W^{l}}$ over the local polar origin grid. By discarding local origin points with low confidence, the module increases the likelihood of selecting potential positive foreground lane anchors while removing background lane anchors to the greatest extent. To keep the design simple, the regression branch $\phi _{reg}^{lph}\left(\cdot \right)$ consists of one $1\times1$ convolutional layer, while the classification branch $\phi _{cls}^{lph}\left(\cdot \right)$ consists of two $1\times1$ convolutional layers.

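A minimal PyTorch-style sketch of this head (an assumption for illustration, with a hypothetical hidden width, not the released code) is:
\begin{verbatim}
# Local polar head: a one-layer 1x1-conv regression branch predicting
# (theta, r) per grid point and a two-layer 1x1-conv classification branch
# predicting the confidence heat map, applied to the downsampled map F_d.
import torch.nn as nn

class LocalPolarHead(nn.Module):
    def __init__(self, c_f, hidden=64):
        super().__init__()
        self.reg = nn.Conv2d(c_f, 2, kernel_size=1)          # angle and radius
        self.cls = nn.Sequential(nn.Conv2d(c_f, hidden, 1), nn.ReLU(),
                                 nn.Conv2d(hidden, 1, 1))     # heat-map logits

    def forward(self, f_d):
        # f_d: (B, C_f, H_l, W_l), the downsampled P_3 feature map
        return self.reg(f_d), self.cls(f_d)
\end{verbatim}
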
\begin{figure}[t]
\centering
@ -257,7 +257,7 @@ We define two types of polar coordinate systems: the global coordinate system an

\textbf{Loss Function.} During the training phase, as illustrated in Fig. \ref{lphlabel}, the ground truth labels for the Local Polar Head (LPH) are constructed as follows. The radius ground truth is defined as the shortest distance from a grid point (local origin point) to the ground truth lane curve. The angle ground truth is defined as the orientation of the vector from the grid point to the nearest point on the curve. A grid point is designated as a positive sample if its radius label is less than a threshold $\tau_{L}$; otherwise, it is considered a negative sample.

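The label construction can be sketched as follows (an illustrative assumption; the angle convention and the names are not taken from the released code):
\begin{verbatim}
# For each grid point (local origin), the radius label is the distance to the
# nearest ground-truth lane point and the angle label is the direction of the
# vector to that nearest point; tau_l thresholds positive samples.
import numpy as np

def lph_labels(grid_pts, lane_pts, tau_l):
    # grid_pts: (M, 2), lane_pts: (K, 2) densely sampled lane points
    diff = lane_pts[None, :, :] - grid_pts[:, None, :]     # (M, K, 2)
    dist = np.linalg.norm(diff, axis=-1)                   # (M, K)
    nearest = dist.argmin(axis=1)
    rows = np.arange(len(grid_pts))
    r_label = dist[rows, nearest]                          # radius ground truth
    v = diff[rows, nearest]                                # vector to nearest point
    theta_label = np.arctan2(v[:, 1], v[:, 0])             # angle ground truth
    return r_label, theta_label, r_label < tau_l           # positive mask
\end{verbatim}
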
Once the regression and classification labels are established, the LPH can be trained using the smooth-L1 loss $d\left(\cdot \right)$ for regression and the binary cross-entropy loss $BCE\left( \cdot , \cdot \right)$ for classification. The LPH loss function is defined as follows:

\begin{equation}
\begin{aligned}
@ -267,7 +267,7 @@ Once the regression and classification labels are established, the LPH can be tr
\label{loss_lph}
\end{equation}

\textbf{Top-$K_{a}$ Anchor Selection.} During the training stage, all $H^{l}\times W^{l}$ anchors are considered candidate anchors and fed into the R-CNN module. This approach helps the R-CNN module learn from sufficient features of negative (background) anchor samples. In the evaluation stage, however, only the top-$K_{a}$ anchors with the highest confidence scores are selected and fed into the R-CNN module. This strategy is designed to filter out potential negative (background) anchors and to reduce the computational complexity of the R-CNN module. By doing so, it maintains the adaptability and flexibility of the anchor distribution while decreasing the total number of anchors. The following experiments will demonstrate the effectiveness of our top-$K_{a}$ anchor selection strategy.

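A minimal sketch of this selection step (an assumption, with hypothetical names) is:
\begin{verbatim}
# Keep only the K_a highest-confidence anchors proposed by the local polar head.
import torch

def select_topk_anchors(cls_logits, anchors, k):
    # cls_logits: (H_l*W_l,) heat-map logits; anchors: (H_l*W_l, 2) = (theta, r_l)
    scores = cls_logits.sigmoid()
    topk_scores, topk_idx = torch.topk(scores, k)
    return anchors[topk_idx], topk_scores
\end{verbatim}
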
\begin{figure}[t]
\centering
@ -277,21 +277,21 @@ Once the regression and classification labels are established, the LPH can be tr
\end{figure}

\subsection{Global Polar Head}
The global polar head (GPH) is a crucial component in the second stage of Polar R-CNN. It takes lane anchor pooling features as input and predicts the precise lane locations and confidences. Fig. \ref{gph} illustrates the structure and pipeline of the GPH, which comprises RoI pooling modules and three subheads (the triplet head), introduced in detail below.

\textbf{RoI Pooling Module.} The RoI pooling module is designed to transform features sampled along lane anchors into a standard feature tensor. Once the local polar parameters of a lane anchor are given, they can be converted to global polar coordinates using the following equation:
\begin{equation}
\begin{aligned}
r^{g}_{j}=r^{l}_{j}+\left( \textbf{c}^{l}_{j}-\textbf{c}^{g} \right) ^{T}\left[\cos\theta_{j}, \sin\theta_{j} \right]
\end{aligned}
\end{equation}
where $\textbf{c}^{l}_{j} \in \mathbb{R}^{2}$ and $\textbf{c}^{g} \in \mathbb{R}^{2}$ represent the Cartesian coordinates of the local and global origins, respectively.

Next, feature points are sampled on the lane anchor. The y-coordinates of these points are uniformly sampled along the vertical axis of the image, as previously mentioned, and the $x_{i}$ coordinates are computed from the global polar parameters with the following equation:

\begin{equation}
\begin{aligned}
x_{i\,\,}=-y_i\tan \theta +\frac{r^{g}}{\cos \theta}
\end{aligned}
\end{equation}

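For illustration, a minimal sketch that follows the two equations above (the numeric values and names are hypothetical):
\begin{verbatim}
# Convert a local polar anchor to the global system and sample its
# x-coordinates at fixed rows:
#   r_g = r_l + (c_l - c_g)^T [cos(theta), sin(theta)]
#   x_i = -y_i * tan(theta) + r_g / cos(theta)
import numpy as np

def local_to_global_radius(r_l, theta, c_l, c_g):
    n = np.array([np.cos(theta), np.sin(theta)])
    return r_l + (np.asarray(c_l) - np.asarray(c_g)) @ n

def anchor_xs(theta, r_g, ys):
    return -ys * np.tan(theta) + r_g / np.cos(theta)

ys = np.linspace(0.0, 320.0, 36)                    # example rows
r_g = local_to_global_radius(35.0, np.deg2rad(20.0), (60.0, 40.0), (400.0, 160.0))
xs = anchor_xs(np.deg2rad(20.0), r_g, ys)
\end{verbatim}
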
@ -316,7 +316,7 @@ where $\boldsymbol{w}_{L}^{s}\in \mathbb{R} ^{N_p}$ represents the learnable agg
\end{aligned}
\end{equation}

\textbf{Triplet Head.} The triplet head comprises three distinct heads: the one-to-one classification (O2O cls) head, the one-to-many classification (O2M cls) head, and the one-to-many regression (O2M reg) head. In various studies \cite{laneatt}\cite{clrnet}\cite{adnet}\cite{srlane}, the detection head predominantly follows the one-to-many paradigm: during the training phase, multiple positive samples are assigned to a single ground truth, and consequently, during the evaluation stage, redundant detection results are often predicted for each instance. These redundancies are typically addressed using NMS, which eliminates duplicates and retains the highest-confidence detection for each ground truth. However, NMS relies on the definition of a distance between detection results, and this calculation can be complex for curved lanes and other irregular geometric shapes. To achieve non-redundant detection results with an NMS-free paradigm, one-to-one label assignment becomes crucial during training, as highlighted in \cite{o2o}. Nevertheless, merely adopting the one-to-one paradigm is insufficient; the structure of the detection head also plays a pivotal role in achieving NMS-free detection. This aspect will be further analyzed in the following sections.

\begin{algorithm}[t]
\caption{Graph-based Fast NMS}
The corresponding positive anchors, $[\theta_i, r_{i}^{global}]$;\\
The x-coordinates of the sampling points of the positive anchors, $\boldsymbol{x}_{i}^{b}$;\\
The positive confidences obtained from the O2M cls head, $s_i$;\\
The positive regressions obtained from the O2M reg head, i.e., the horizontal offsets $\varDelta \boldsymbol{x}_{i}^{roi}$ and the end point locations $\boldsymbol{e}_{i}$;\\
\ENSURE ~~\\ % Output of the algorithm
\STATE Calculate the confidence adjacency matrix $\boldsymbol{C} \in \mathbb{R} ^{N_{pos} \times N_{pos}} $, where the element $C_{ij}$ of $\boldsymbol{C}$ is calculated as follows:
\label{gnn}
\end{figure}

\textbf{NMS vs NMS-free.} Let $\boldsymbol{F}^{roi}_{i}$ denote the RoI features extracted from the $i$-th anchor; the three subheads take $\boldsymbol{F}^{roi}_{i}$ as input. For now, let us focus on the O2M classification (O2M cls) head and the O2M regression (O2M reg) head, which follow the paradigm used in previous work and can serve as a baseline for the new one-to-one paradigm. To maintain simplicity and rigor, both the O2M cls head and the O2M reg head consist of two layers with activation functions, featuring a plain structure without any complex mechanisms such as attention or deformable convolution. As previously mentioned, merely replacing the one-to-many label assignment with a one-to-one label assignment is insufficient for eliminating NMS post-processing. This is because anchors often exhibit significant overlap or are positioned very close to each other, as shown in Fig. \ref{anchor setting} (b)(c). Let $\boldsymbol{F}^{roi}_{i}$ and $\boldsymbol{F}^{roi}_{j}$ represent the features from two overlapping (or very close) anchors, implying that $\boldsymbol{F}^{roi}_{i}$ and $\boldsymbol{F}^{roi}_{j}$ will be almost identical. Let $f_{cls}^{plain}$ denote the network used in the O2M cls head, and suppose it is trained with one-to-one label assignment. If $\boldsymbol{F}^{roi}_{i}$ is a positive sample and $\boldsymbol{F}^{roi}_{j}$ is a negative sample, the ideal output should be as follows:
\begin{equation}
\begin{aligned}
&\boldsymbol{F}_{i}^{roi}\approx \boldsymbol{F}_{j}^{roi},
\\
&f_{cls}^{plain}\left( \boldsymbol{F}_{i}^{roi} \right) \rightarrow 1,\quad f_{cls}^{plain}\left( \boldsymbol{F}_{j}^{roi} \right) \rightarrow 0.
\end{aligned}
\label{sharp fun}
\end{equation}

Equation \ref{sharp fun} suggests that $f_{cls}^{plain}$ needs to be ``sharp'' enough to differentiate between two similar features. That is, the output of $f_{cls}^{plain}$ must change rapidly over small input distances, which implies that $f_{cls}^{plain}$ needs to capture higher-frequency information. This issue is also discussed in \cite{o3d}. Capturing such high-frequency behavior with a plain structure is difficult because a naive MLP tends to capture lower-frequency information \cite{xu2022overview}. In the most extreme case, where $\boldsymbol{F}_{i}^{roi} = \boldsymbol{F}_{j}^{roi}$, it becomes impossible to separate the two anchors into a positive and a negative sample; in practice, both confidences converge to around 0.5. This problem arises from the limitations of the input format and the structure of the naive MLP, which restrict its capability to express higher-frequency information. Therefore, it is crucial to establish relationships between anchors and to design a new model structure that can effectively represent such ``sharp'' information.

It is easy to see that the ``ideal'' one-to-one branch is equivalent to the O2M cls branch combined with O2M regression and NMS post-processing. If NMS could be replaced by an equivalent but learnable function (e.g., a neural network with a specific structure), the O2O head could be trained to handle the one-to-one assignment. However, NMS involves sequential iteration and confidence sorting, which are challenging to reproduce with a neural network. Although previous works, such as RNN-based approaches \cite{stewart2016end}, adopt an iterative formulation, they are time-consuming and introduce additional complexity into the model training process due to their iterative nature. To eliminate the iteration process, we propose an equivalent formulation of Fast NMS \cite{yolact}.

The core rule of NMS post-processing is as follows:
Given a series of positive detections with redundancy, a detection result A is suppressed by another detection result B if and only if:

(1) The confidence of A is lower than that of B.

(2) The predefined distance (e.g., IoU distance or L1 distance) between A and B is smaller than a threshold.

(3) B is not suppressed by any other detection result.

For simplicity, Fast NMS only enforces conditions (1) and (2), which may lead to an increase in false negative predictions but offers faster processing without sequential iteration. Leveraging this ``iteration-free'' property, we propose a further refinement called ``sort-free'' Fast NMS. This new approach, named Graph-based Fast NMS, is detailed in Algorithm \ref{Graph Fast NMS}.

It is straightforward to demonstrate that, when all elements in $\boldsymbol{M}$ are set to 1 (i.e., regardless of geometric priors), Graph-based Fast NMS is equivalent to Fast NMS. Building upon the newly proposed Graph-based Fast NMS, we can design the structure of the one-to-one classification head in a manner that mirrors its principles.
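
As a concrete illustration, the NumPy sketch below expresses the iteration-free and sort-free suppression described above; the pairwise distance matrix, the geometric-prior mask $\boldsymbol{M}$, and all names are expository assumptions rather than the exact implementation.
\begin{verbatim}
import numpy as np

def graph_based_fast_nms(scores, dists, dist_thr, prior_mask=None):
    """Sketch of Graph-based Fast NMS (no sorting, no iteration).
    scores: (N,) O2M confidences; dists: (N, N) pairwise distances;
    prior_mask: optional (N, N) 0/1 geometric prior (the matrix M).
    Setting prior_mask to all ones recovers plain Fast NMS."""
    n = len(scores)
    idx = np.arange(n)
    # C[i, j] = 1 if prediction j outranks prediction i (ties broken by index).
    conf_adj = (scores[None, :] > scores[:, None]) | (
        (scores[None, :] == scores[:, None]) & (idx[None, :] < idx[:, None]))
    close = dists < dist_thr                     # condition (2)
    if prior_mask is not None:
        close &= prior_mask.astype(bool)
    suppressed = (conf_adj & close).any(axis=1)  # conditions (1) and (2)
    return ~suppressed                           # mask of kept predictions
\end{verbatim}
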
According to the analysis of the shortcomings of traditional NMS post-processing shown in Fig. \ref{NMS setting}, the fundamental issue arises from the definition of the distance between predictions. Traditional NMS relies on geometric properties to define distances between predictions, which often neglects contextual semantics. For example, in some scenarios, two predicted lanes with a small geometric distance should not be suppressed, as in the case of double lines or fork lines. Although setting a threshold $d_{\tau}$ can mitigate this problem, it is challenging to strike a balance between precision and recall.

To address this, we replace the explicit definition of the distance function with an implicit graph neural network. Additionally, the explicit anchor coordinates are replaced with the anchor features ${F}_{i}^{roi}$. According to information bottleneck theory \cite{alemi2016deep}, ${F}_{i}^{roi}$, which contains the location and classification information, is sufficient for a neural network to model the explicit geometric distance. Beyond the geometric information, the features ${F}_{i}^{roi}$ also contain the contextual information of an anchor, which provides additional clues for establishing implicit contextual distances between two anchors. The implicit contextual distance is calculated as follows:

\begin{equation}
\begin{aligned}
\tilde{\boldsymbol{F}}_{i}^{roi}\gets& \mathrm{Re}LU\left( FC_{o2o}^{roi}\left( \boldsymbol{F}_{i}^{roi} \right) \right)
\\
\boldsymbol{F}_{ij}^{edge}\gets& FC_{in}\left( \tilde{\boldsymbol{F}}_{i}^{roi} \right) -FC_{out}\left( \tilde{\boldsymbol{F}}_{j}^{roi} \right)
\end{aligned}
\label{edge_layer}
\end{equation}
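
A PyTorch-style sketch of Eq. \ref{edge_layer} is given below; the layer names and feature dimensions are illustrative assumptions, not the released code.
\begin{verbatim}
import torch
import torch.nn as nn

class EdgeLayer(nn.Module):
    """Sketch of the edge features behind the implicit anchor distance."""
    def __init__(self, roi_dim=64, edge_dim=16):
        super().__init__()
        self.fc_roi = nn.Linear(roi_dim, roi_dim)   # FC_o2o^roi
        self.fc_in = nn.Linear(roi_dim, edge_dim)   # FC_in
        self.fc_out = nn.Linear(roi_dim, edge_dim)  # FC_out

    def forward(self, roi_feats):  # roi_feats: (N, roi_dim)
        f = torch.relu(self.fc_roi(roi_feats))      # \tilde{F}^{roi}
        # Edge feature for every ordered anchor pair (i, j).
        return self.fc_in(f)[:, None, :] - self.fc_out(f)[None, :, :]
\end{verbatim}
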
Equation \ref{edge_layer} is the implicit counterpart of Eq. \ref{al_1-3}: the distance $\boldsymbol{D}_{ij}^{edge}$ is no longer a scalar but a semantic tensor of dimension $d_{dis}$, and it therefore contains richer information than a traditional geometric distance. The confidence calculation is expressed as follows:

\textbf{Label Assignment and Cost Function.} We use a label assignment strategy (SimOTA) similar to previous works \cite{clrnet}\cite{clrernet}. However, to make the cost function more compact and consistent with general object detection works \cite{iouloss}\cite{giouloss}, we redefine the lane IoU. As illustrated in Fig. \ref{glaneiou}, the redefined lane IoU, which we refer to as GLaneIoU, is given as follows:

\begin{figure}[t]
\centering
\end{figure}
\begin{equation}
\begin{aligned}
&w_{i}^{k}=\frac{\sqrt{\left( \Delta x_{i}^{k} \right) ^2+\left( \Delta y_{i}^{k} \right) ^2}}{\Delta y_{i}^{k}}w_{b}
\\
&\hat{d}_{i}^{\mathcal{O}}=\min \left( x_{i}^{p}+w_{i}^{p}, x_{i}^{q}+w_{i}^{q} \right) -\max \left( x_{i}^{p}-w_{i}^{p}, x_{i}^{q}-w_{i}^{q} \right)
\\
&GLaneIoU=\frac{\sum\nolimits_{i=j}^k{d_{i}^{\mathcal{O}}}}{\sum\nolimits_{i=j}^k{d_{i}^{\mathcal{U}}}}-g\frac{\sum\nolimits_{i=j}^k{d_{i}^{\xi}}}{\sum\nolimits_{i=j}^k{d_{i}^{\mathcal{U}}}}
\end{aligned}
\end{equation}
where $j$ and $k$ are the indices of the valid points (the start and end points). It is straightforward to observe that when $g=0$, GLaneIoU corresponds to the IoU for bounding boxes, with a value range of $\left[0, 1 \right]$; when $g=1$, GLaneIoU corresponds to the GIoU \cite{giouloss} for bounding boxes, with a value range of $\left(-1, 1 \right]$. In general, when $g>0$, the value range of GLaneIoU is $\left(-g, 1 \right]$.
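
The following NumPy sketch computes GLaneIoU for two lanes sampled at common rows; since the union and gap terms $d^{\mathcal{U}}$ and $d^{\xi}$ are defined elsewhere in the paper, their (G)IoU-style forms here are assumptions.
\begin{verbatim}
import numpy as np

def glane_iou(xs_p, xs_q, w_p, w_q, g=1.0):
    """Sketch of GLaneIoU between two lanes sampled at the same rows.
    xs_*: (N,) x-coordinates; w_*: (N,) half-widths w_i at those rows."""
    left_p, right_p = xs_p - w_p, xs_p + w_p
    left_q, right_q = xs_q - w_q, xs_q + w_q
    inter = np.minimum(right_p, right_q) - np.maximum(left_p, left_q)
    union = np.maximum(right_p, right_q) - np.minimum(left_p, left_q)
    d_over = np.clip(inter, 0.0, None)   # overlap term d^O
    d_gap = np.clip(-inter, 0.0, None)   # gap term d^xi (assumed)
    return (d_over.sum() - g * d_gap.sum()) / union.sum()
\end{verbatim}
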
Following \cite{detr}, we then define the cost function between the $i$-th prediction and the $j$-th ground truth as follows:
\begin{equation}
\begin{aligned}
\mathcal{C} _{ij}=\left(s_i\right)^{\beta_c}\times \left( GLaneIoU_{ij, g=0} \right) ^{\beta_r}
\end{aligned}
\end{equation}
This cost function is more compact than those in previous works \cite{clrnet}\cite{adnet} and takes both location and confidence into account. For label assignment, SimOTA (with $k=4$) \cite{yolox} is used for the two O2M heads with one-to-many assignment, while the Hungarian algorithm \cite{detr} is employed for the O2O classification head with one-to-one assignment.
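
A brief sketch of how this cost could drive the one-to-one assignment is shown below; the exponent values and the use of SciPy's Hungarian solver are illustrative assumptions.
\begin{verbatim}
import numpy as np
from scipy.optimize import linear_sum_assignment

def one_to_one_assign(scores, glane_iou_g0, beta_c=1.0, beta_r=3.0):
    """Sketch of the O2O assignment driven by the cost above.
    scores: (N_pred,) confidences s_i; glane_iou_g0: (N_pred, N_gt)
    GLaneIoU with g=0. beta_c/beta_r are assumed values."""
    cost = (scores[:, None] ** beta_c) * (glane_iou_g0 ** beta_r)
    # The cost acts as a matching score, so negate it for the minimizer.
    pred_idx, gt_idx = linear_sum_assignment(-cost)
    return pred_idx, gt_idx
\end{verbatim}
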
\textbf{Loss Function.} We use the focal loss \cite{focal} for the O2O cls head and the O2M cls head:
\begin{equation}
\begin{aligned}
\mathcal{L}_{o2m}^{cls}&=-\sum_{i\in \varOmega _{pos}^{o2m}}{\alpha _{o2m}\left( 1-s_i \right) ^{\gamma}\log \left( s_i \right)}\\&-\sum_{i\in \varOmega _{neg}^{o2m}}{\left( 1-\alpha _{o2m} \right) \left( s_i \right) ^{\gamma}\log \left( 1-s_i \right)}
\\
\mathcal{L}_{o2o}^{cls}&=-\sum_{i\in \varOmega _{pos}^{o2o}}{\alpha _{o2o}\left( 1-\tilde{s}_i \right) ^{\gamma}\log \left( \tilde{s}_i \right)}\\&-\sum_{i\in \varOmega _{neg}^{o2o}}{\left( 1-\alpha _{o2o} \right) \left( \tilde{s}_i \right) ^{\gamma}\log \left( 1-\tilde{s}_i \right)}
\end{aligned}
\end{equation}
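
For reference, a compact PyTorch sketch of this focal loss is shown below; the default $\alpha$ and $\gamma$ values are the common ones from the focal-loss paper and are not necessarily those used in this work.
\begin{verbatim}
import torch

def focal_cls_loss(scores, labels, alpha=0.25, gamma=2.0):
    """Sketch of the focal loss for a cls head.
    scores: (N,) confidences in (0, 1); labels: (N,) float in {0, 1}."""
    eps = 1e-6
    pos = labels * alpha * (1.0 - scores).pow(gamma) * torch.log(scores + eps)
    neg = (1.0 - labels) * (1.0 - alpha) * scores.pow(gamma) \
          * torch.log(1.0 - scores + eps)
    return -(pos + neg).sum()
\end{verbatim}
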
\begin{equation}
\begin{aligned}
\varOmega _{pos}^{o2o}\cup \varOmega _{neg}^{o2o}=\left\{ i\,|\,s_i>C_{o2m} \right\}
\end{aligned}
\end{equation}
Only the samples with confidence larger than $C_{o2m}$ are chosen as candidate samples for the O2O cls head. Following \cite{pss}, to maintain feature quality during the training stage, the gradients of the O2O cls head are stopped from propagating back to the rest of the network (i.e., they are detached from the RoI feature of the anchor, $\boldsymbol{F}_{i}^{roi}$). Additionally, we use a rank loss to increase the gap between the positive and negative confidences of the O2O cls head:
\begin{equation}
\begin{aligned}
&\mathcal{L}_{rank}=\frac{1}{N_{rank}}\sum_{i\in \varOmega _{pos}^{o2o}}{\sum_{j\in \varOmega _{neg}^{o2o}}{\max \left( 0, \tau _{rank}-\tilde{s}_i+\tilde{s}_j \right)}}
\end{aligned}
\end{equation}
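
A minimal PyTorch sketch of this rank loss follows; the margin value and the choice of averaging over all positive--negative pairs (as the normalization $N_{rank}$) are assumptions.
\begin{verbatim}
import torch

def rank_loss(pos_scores, neg_scores, tau_rank=0.5):
    """Sketch of the rank loss on O2O confidences.
    pos_scores/neg_scores: 1-D tensors of positive/negative confidences."""
    # Hinge on every (positive, negative) pair with margin tau_rank.
    margins = tau_rank - pos_scores[:, None] + neg_scores[None, :]
    return torch.clamp(margins, min=0.0).mean()
\end{verbatim}
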
We directly use the GLaneIoU loss, $\mathcal{L}_{GLaneIoU}$ (with $g=1$), to regress the x-coordinate offsets, and the Smooth-L1 loss for the regression of the end points (namely the y-coordinates of the start point and the end point), denoted as $\mathcal{L}_{end}$. To encourage the model to learn global geometric features, we further propose the auxiliary loss illustrated in Fig. \ref{auxloss}:
\begin{align}
\mathcal{L}_{aux} &= \frac{1}{\left| \varOmega_{pos}^{o2m} \right| N_{seg}} \sum_{i \in \varOmega_{pos}^{o2o}} \sum_{m=j}^k \Bigg[ l \left( \theta_i - \hat{\theta}_{i}^{seg,m} \right) \\
&\quad + l \left( r_{i}^{global} - \hat{r}_{i}^{seg,m} \right) \Bigg]
\end{align}
The anchors and ground truths are divided into several segments. Each anchor segment is regressed to the main components of the corresponding segment of its assigned ground truth. This helps the anchors learn more about the global geometric shape.



\subsection{Loss function}

The overall loss function of Polar R-CNN is given as follows:
\begin{equation}
\begin{aligned}
\mathcal{L}_{overall} &=\mathcal{L} _{lph}^{cls}+w_{lph}^{reg}\mathcal{L} _{lph}^{reg}\\&+w_{o2m}^{cls}\mathcal{L} _{o2m}^{cls}+w_{o2o}^{cls}\mathcal{L} _{o2o}^{cls}+w_{rank}\mathcal{L} _{rank}\\&+w_{IoU}\mathcal{L} _{IoU}+w_{end}\mathcal{L} _{end}+w_{aux}\mathcal{L} _{aux}
\end{aligned}
\end{equation}

& $w_{rank}$ &0.7&0.7&0.1&0.7&0 \\
\midrule
\multirow{4}*{Evaluation Hyperparameter}
& $H^{l}\times W^{l}$ &$4\times10$&$4\times10$&$4\times10$&$4\times10$&$6\times13$\\
& $K_{a}$ &20&20&20&12&50\\
& $C_{O2M}$ &0.48&0.40&0.40&0.40&0.45\\
& $C_{O2O}$ &0.46&0.46&0.46&0.46&0.44\\
\bottomrule
\subsection{Dataset and Evaluation Metric}
We conducted experiments on four widely used lane detection benchmarks and one rail detection dataset: CULane\cite{scnn}, TuSimple\cite{tusimple}, LLAMAS\cite{llamas}, CurveLanes\cite{curvelanes}, and DL-Rail\cite{dalnet}. Among these datasets, CULane and CurveLanes are particularly challenging. The CULane dataset consists of various scenarios but has sparse lane distributions, whereas CurveLanes includes a large number of curved and dense lane types, such as forked and double lanes. The DL-Rail dataset, focused on rail detection across different scenarios, was chosen to evaluate our model's performance beyond traditional lane detection. Details of the five datasets are shown in Table \ref{dataset_info}.

We use the F1-score to evaluate our model on the CULane, LLAMAS, DL-Rail, and CurveLanes datasets, maintaining consistency with previous works. The F1-score is defined as follows:
\begin{equation}
\begin{aligned}
F1=\frac{2\times Precision\times Recall}{Precision+Recall}
\\
Precision=\frac{TP}{TP+FP}
\\
Recall=\frac{TP}{TP+FN}
\end{aligned}
\end{equation}
In our experiments, we use different IoU thresholds to calculate the F1-score for different datasets: F1@50 and F1@75 for CULane \cite{clrnet}, F1@50 for LLAMAS \cite{clrnet} and CurveLanes \cite{CondLaneNet}, and F1@50, F1@75, and mF1 for DL-Rail \cite{dalnet}. The mF1 is defined as:
\begin{equation}
\begin{aligned}
mF1=\left( F1@50+F1@55+\cdots+F1@95 \right) /10
\end{aligned}
\end{equation}
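
The short Python sketch below makes these metrics explicit; the input format for the per-threshold F1 values is an assumption.
\begin{verbatim}
def f1_score(tp, fp, fn):
    """F1 from true-positive, false-positive and false-negative counts."""
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    denom = precision + recall
    return 2.0 * precision * recall / denom if denom else 0.0

def mf1(f1_per_threshold):
    """mF1 as the mean of F1@50, F1@55, ..., F1@95 (ten values)."""
    values = list(f1_per_threshold)
    return sum(values) / len(values)
\end{verbatim}
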
RESA\cite{resa} &ResNet50 &75.30&53.39&92.10&73.10&69.20&72.80&47.70&83.30&70.30&1503&69.90\\
LaneAF\cite{laneaf} &DLA34 &77.41&- &91.80&75.61&71.78&79.12&51.38&86.88&72.70&1360&73.03\\
UFLDv2\cite{ufldv2} &ResNet34 &76.0 &- &92.5 &74.8 &65.5 &75.5 &49.2 &88.8 &70.1 &1910&70.8 \\
CondLaneNet\cite{CondLaneNet} &ResNet101&79.48&61.23&93.47&77.44&70.93&80.91&54.13&90.16&75.21&1201&74.80\\
\cline{1-1}
\textbf{Parameter} \\
\cline{1-1}
\hline
\textbf{Proposed Method} \\
\cline{1-1}
Polar R-CNN-NMS &ResNet18&80.81&63.96&94.12&79.57&76.53&83.33&55.06&90.62&79.50&1088&75.25\\
Polar R-CNN &ResNet18&80.81&63.96&94.12&79.57&76.53&83.33&55.06&90.62&79.50&1088&75.25\\
Polar R-CNN &ResNet34&80.92&63.97&94.24&79.76&76.70&81.93&55.40&\textbf{91.12}&79.85&1158&75.71\\
Polar R-CNN &ResNet50&81.34&64.77&94.45&\textbf{80.42}&75.82&83.61&56.62&91.10&80.05&1356&75.94\\
Polar R-CNN-NMS &DLA34 &\textbf{81.49}&64.96&\textbf{94.44}&80.36&\textbf{76.83}&83.68&56.53&90.85&\textbf{80.09}&1135&76.32\\
Polar R-CNN &DLA34 &\textbf{81.49}&\textbf{64.97}&\textbf{94.44}&80.36&\textbf{76.79}&83.68&\textbf{56.52}&90.85&\textbf{80.09}&1133&76.32\\
\bottomrule
\end{tabular}
\end{adjustbox}
UFLDv2\cite{ufld} &ResNet34 &88.08&95.73&18.84&3.70\\
LaneATT\cite{laneatt} &ResNet34 &95.63&96.77&3.53&2.92\\
FOLOLane\cite{laneatt} &ERFNet &\textbf{96.92}&96.59&4.47&2.28\\
CondLaneNet\cite{CondLaneNet}&ResNet101 &96.54&97.24&2.01&3.50\\
CLRNet\cite{clrnet} &ResNet18 &96.84&97.89&2.28&1.92\\
\midrule
Polar R-CNN-NMS &ResNet18&96.21&\textbf{97.98}&2.17&1.86\\
Polar R-CNN &ResNet18&96.20&97.94&2.25&1.87\\
\bottomrule
\end{tabular}
\end{adjustbox}
CLRNet\cite{clrnet} &DLA34 &96.12&- &- \\
\midrule

Polar R-CNN-NMS &ResNet18&96.05&96.80&95.32\\
Polar R-CNN &ResNet18&96.06&96.81&95.32\\
Polar R-CNN-NMS &DLA34&96.13&96.80&\textbf{95.47}\\
Polar R-CNN &DLA34&\textbf{96.14}&96.82&\textbf{95.47}\\

\bottomrule
\end{tabular}
\midrule
BézierLaneNet\cite{bezierlanenet} &ResNet18&42.81&85.13&38.62\\
GANet-S\cite{ganet} &ResNet18&57.64&95.68&62.01\\
CondLaneNet\cite{CondLaneNet} &ResNet18&52.37&95.10&53.10\\
UFLDv1\cite{ufld} &ResNet34&53.76&94.78&57.15\\
LaneATT(with RPN)\cite{dalnet} &ResNet18&55.57&93.82&58.97\\
DALNet\cite{dalnet} &ResNet18&59.79&96.43&65.48\\
\midrule
Polar R-CNN-NMS &ResNet18&\textbf{61.53}&\textbf{97.01}&\textbf{67.86}\\
Polar R-CNN &ResNet18&61.52&96.99&67.85\\
\bottomrule
\end{tabular}
\end{adjustbox}
CurveLane-M\cite{curvelanes} &- &81.80&93.49&72.71\\
CurveLane-L\cite{curvelanes} &- &82.29&91.11&75.03\\
UFLDv2\cite{ufldv2} &ResNet34 &81.34&81.93&80.76\\
CondLaneNet-M\cite{CondLaneNet} &ResNet34 &85.92&88.29&83.68\\
CondLaneNet-L\cite{CondLaneNet} &ResNet101&86.10&88.98&83.41\\
CLRNet\cite{clrnet} &DLA34 &86.10&91.40&81.39\\
CLRerNet\cite{clrernet} &DLA34 &86.47&91.66&81.83\\
\hline
Polar R-CNN &DLA34&\textbf{87.29}&90.50&\textbf{84.31}\\
\hline
\end{tabular}
\end{adjustbox}
\end{table}

\subsection{Comparison with the State-of-the-Art Results}
The comparison results of our proposed model with other methods are shown in Tables \ref{culane result}, \ref{tusimple result}, \ref{llamas result}, \ref{dlrail result}, and \ref{curvelanes result}. We present results for two versions of our model: the NMS-based version, denoted as Polar R-CNN-NMS, and the NMS-free version, denoted as Polar R-CNN. The NMS-based version utilizes predictions obtained from the O2M head followed by NMS post-processing, while the NMS-free version derives predictions directly from the O2O classification head without NMS.

To ensure a fair comparison, we also include results for CLRerNet \cite{clrernet} on the CULane and CurveLanes datasets, as we use a similar training strategy and data split. As illustrated in the comparison results, our model demonstrates competitive performance across the five datasets. Specifically, on the CULane, TuSimple, LLAMAS, and DL-Rail datasets (sparse scenarios), our model outperforms other anchor-based methods. Additionally, the performance of the NMS-free version is nearly identical to that of the NMS-based version, highlighting the effectiveness of the O2O head in eliminating redundant predictions. On the CurveLanes dataset, the NMS-free version achieves superior F1-measure and recall compared to both NMS-based and segment\&grid-based methods.

\end{figure}

\subsection{Ablation Study and Visualization}
To validate and analyze the effectiveness and influence of the different components of Polar R-CNN, we conduct several ablation experiments on the CULane and CurveLanes datasets.

\textbf{Ablation study on polar coordinate system and anchor number.} To assess the importance of the local polar coordinates of anchors, we examine the contribution of each component (i.e., angle and radius) to model performance. As shown in Table \ref{aba_lph}, both angle and radius contribute to performance to varying degrees. Additionally, we conduct experiments with the auxiliary loss using fixed anchors and Polar R-CNN. Fixed anchors refer to the anchor settings trained by CLRNet, as illustrated in Fig. \ref{anchor setting} (b). Model performance improves by 0.48\% and 0.30\% under the fixed anchor paradigm and the proposal anchor paradigm, respectively.

We also explore the effect of different local polar map sizes on our model, as illustrated in Fig. \ref{anchor_num_testing}. The overall F1 measure improves with increasing local polar map size and tends to stabilize when the size is sufficiently large. Specifically, precision improves, while recall decreases. A larger polar map size includes more background anchors in the second stage (since we choose $k=4$ for SimOTA, with no more than four positive samples). Consequently, the model learns more negative samples, enhancing precision but reducing recall. Regarding the number of anchors chosen during the evaluation stage, recall and F1 measure show a significant increase in the early stages of anchor number expansion but stabilize in later stages. This suggests that eliminating some anchors does not significantly affect performance. Fig. \ref{cam} displays the heat map and the distribution of the top-$K_{a}$ selected anchors in sparse scenarios. Brighter colors indicate a higher likelihood of anchors being foreground anchors. It is evident that most of the proposed anchors are clustered around the lane ground truth.

\begin{table}[h]
\centering
\label{cam}
\end{figure}

\textbf{Ablation study on NMS-free block in sparse scenarios.} We conduct several experiments on the CULane dataset to evaluate the performance of the NMS-free head in sparse scenarios. As shown in Table \ref{aba_NMSfree_block}, without using the GNN to establish relationships between anchors, Polar R-CNN fails to achieve an NMS-free paradigm, even with one-to-one assignment. Furthermore, the classification matrix (cls matrix) proves crucial, indicating that the conditional probability is effective. Other components, such as the neighbor matrix (provided as a geometric prior) and the rank loss, also contribute to the performance of the NMS-free block.

To compare the NMS-free paradigm with the traditional NMS paradigm, we perform experiments with the NMS-free block under both proposal and fixed anchor strategies. Table \ref{NMS vs NMS-free} presents the results of these experiments. Here, O2M-B refers to the O2M classification head, O2O-B refers to the O2O classification head with a plain structure, and O2O-G refers to the O2O classification head with our proposed GNN structure. To assess the ability to eliminate redundant predictions, NMS post-processing is applied to each head. The results show that NMS is necessary for the traditional O2M classification head. In the fixed anchor paradigm, although the O2O classification head with a plain structure effectively eliminates redundant predictions, it is less effective than the proposed GNN structure. In the proposal anchor paradigm, the O2O classification head with a plain structure fails to eliminate redundant predictions due to high anchor overlap and similar RoI features. Thus, the GNN is essential for Polar R-CNN in the NMS-free paradigm. In both the fixed and proposal anchor paradigms, the O2O classification head with the GNN structure successfully eliminates redundant predictions, indicating that our GNN-based O2O classification head can replace NMS post-processing in sparse scenarios without a loss in performance. This confirms our earlier theory that both the structure and the label assignment are crucial for an NMS-free paradigm.

We also explore the stop-gradient strategy for the O2O classification head. As shown in Table \ref{stop}, the gradient of the O2O classification head negatively impacts both the O2M classification head (with NMS post-processing) and the O2O classification head. This suggests that one-to-one assignment introduces critical bias into feature learning.

\begin{table}[h]
\centering
\caption{Ablation study on the NMS-free block}
\begin{adjustbox}{width=\linewidth}
\begin{tabular}{cccc|ccc}
\toprule
\bottomrule
\end{tabular}
\end{adjustbox}
\label{aba_NMSfree_block}
\end{table}

\bottomrule
\end{tabular}
\end{adjustbox}
\label{NMS vs NMS-free}
\end{table}

\textbf{Ablation study on NMS-free block in dense scenarios.} Although we have demonstrated the feasibility of replacing NMS with the O2O classification head in sparse scenarios, the shortcomings of NMS in dense scenarios remain. To investigate the performance of the NMS-free block in dense scenarios, we conduct experiments on the CurveLanes dataset, as detailed in Table \ref{aba_NMS_dense}.

In traditional NMS post-processing \cite{clrernet}, the default NMS threshold is set to 50 pixels. However, this default setting may not always be optimal, especially in dense scenarios where some lane predictions might be erroneously eliminated. Lowering the threshold increases recall but decreases precision. To find the most effective threshold, we experimented with various values and found that a threshold of 15 pixels achieves the best trade-off, resulting in an F1-score of 86.81\%. In contrast, the NMS-free paradigm with the GNN-based O2O classification head achieves an overall F1-score of 87.29\%, which is 0.48\% higher than the optimal threshold setting in the NMS paradigm. Additionally, both precision and recall are improved under the NMS-free approach. This indicates that the GNN-based O2O classification head is capable of learning semantic distances between anchors in addition to geometric distances, thus providing a more effective solution for dense scenarios compared to the traditional NMS approach.

\toprule
\textbf{Paradigm} & \textbf{NMS thres. (pixel)} & \textbf{F1(\%)} & \textbf{Precision(\%)} & \textbf{Recall(\%)} \\
\midrule
\multirow{7}*{Polar R-CNN-NMS}
& 50 (default) &85.38&\textbf{91.01}&80.40\\
& 40 &85.97&90.72&81.68\\
& 30 &86.26&90.44&82.45\\
& 15 (optimal) &86.81&89.64&84.16\\
& 10 &86.58&88.62&\textbf{84.64}\\
\midrule
Polar R-CNN & - &\textbf{87.29}&90.50&84.31\\
\bottomrule
\end{tabular}
\end{adjustbox}
\label{aba_NMS_dense}
\end{table}

\textbf{Visualization.} We present Polar R-CNN predictions for both sparse and dense scenarios. Fig. \ref{vis_sparse} displays the predictions for sparse scenarios across four datasets. The local polar head effectively proposes anchors clustered around the ground truth, providing a robust prior for the RoI stage to produce the final lane predictions. Moreover, the number of anchors is significantly reduced compared to previous works, making our method theoretically faster than other anchor-based methods. Fig. \ref{vis_dense} shows the predictions for dense scenarios. We observe that NMS@50 mistakenly removes some true predictions, leading to false negatives, while NMS@15 fails to eliminate redundant predictions, resulting in false positives. This highlights the trade-off between large and small IoU thresholds. The visualization clearly demonstrates that geometric distance becomes less effective in dense scenarios. Only the O2O classification head, driven by data, can address this issue by capturing semantic distance beyond geometric distance. As shown in Fig. \ref{vis_dense}, the O2O classification head successfully eliminates redundant predictions while retaining distinct true predictions that are geometrically close.
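To make the contrast with hand-tuned NMS concrete, the sketch below shows one generic way a learned, graph-style head can combine geometric and semantic (feature) cues into pairwise suppression scores, in the spirit of learnable NMS \cite{learnNMS}; the layer sizes, feature choices, and scoring rule are assumptions for illustration and do not correspond to the exact O2O classification head described above.
\begin{verbatim}
# Generic sketch of a learned pairwise suppression head; illustrative only.
import torch
import torch.nn as nn

class PairwiseSuppression(nn.Module):
    def __init__(self, feat_dim=64, hidden=32):
        super().__init__()
        # Edge MLP: scores how strongly anchor j should suppress anchor i,
        # given both anchors' features and their geometric distance.
        self.edge_mlp = nn.Sequential(
            nn.Linear(2 * feat_dim + 1, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))

    def forward(self, feats, scores, geo_dist):
        # feats: (N, D) per-anchor features; scores: (N,) one-to-many
        # confidences; geo_dist: (N, N) pairwise geometric distances.
        n = feats.size(0)
        fi = feats.unsqueeze(1).expand(n, n, -1)
        fj = feats.unsqueeze(0).expand(n, n, -1)
        edge_in = torch.cat([fi, fj, geo_dist.unsqueeze(-1)], dim=-1)
        suppress = torch.sigmoid(self.edge_mlp(edge_in)).squeeze(-1)
        # Only higher-scoring anchors may suppress lower-scoring ones.
        may_suppress = (scores.unsqueeze(0) > scores.unsqueeze(1)).float()
        penalty = (suppress * may_suppress).max(dim=1).values
        # Final one-to-one style scores: duplicates are pushed toward zero
        # even when their geometric distance to the kept lane is small.
        return scores * (1.0 - penalty)
\end{verbatim}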
@@ -1159,7 +1159,7 @@
\includegraphics[width=\imgwidth, height=\imgheight]{thsis_figure/view_nms/redun_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thsis_figure/view_nms/redun_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}
@@ -1173,7 +1173,7 @@
\includegraphics[width=\imgwidth, height=\imgheight]{thsis_figure/view_nms/redun2_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thsis_figure/view_nms/redun2_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}
@@ -1188,7 +1188,7 @@
\includegraphics[width=\imgwidth, height=\imgheight]{thsis_figure/view_nms/less_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thsis_figure/view_nms/less_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}
@@ -1202,7 +1202,7 @@
\includegraphics[width=\imgwidth, height=\imgheight]{thsis_figure/view_nms/less2_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thsis_figure/view_nms/less2_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}
@@ -1216,7 +1216,7 @@
\includegraphics[width=\imgwidth, height=\imgheight]{thsis_figure/view_nms/all_pred15.jpg}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thsis_figure/view_nms/all_NMSfree.jpg}
\end{subfigure}
\vspace{0.5em}
@@ -1233,7 +1233,7 @@
\caption{NMS@15}
\end{subfigure}
\begin{subfigure}{\subwidth}
\includegraphics[width=\imgwidth, height=\imgheight]{thsis_figure/view_nms/all2_NMSfree.jpg}
\caption{NMS-free}
\end{subfigure}
\vspace{0.5em}
@@ -1244,7 +1244,7 @@
\section{Conclusion and Future Work}
In this paper, we propose Polar R-CNN to address two key issues in anchor-based lane detection methods. By incorporating local and global polar coordinate systems, Polar R-CNN achieves improved performance with fewer anchors. Additionally, the introduction of a GNN-based O2O classification head allows us to replace traditional NMS post-processing, and the NMS-free paradigm demonstrates superior performance in dense scenarios. Our model is highly flexible, and the number of anchors can be adjusted to suit the specific scenario. Users may choose either the O2M classification head with NMS post-processing or the O2O classification head for an NMS-free approach. Polar R-CNN is also deployment-friendly due to its simple structure, making it a potential new baseline for lane detection. Future work could explore incorporating new structures, such as large kernels or attention mechanisms, and experimenting with new label assignment, training, and anchor sampling strategies. We also plan to extend Polar R-CNN to video instance lane detection and 3D lane detection, utilizing advanced geometric modeling for these new tasks.
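As a side note on the anchor parameterization summarized above, the sketch below illustrates how a straight lane anchor can be described by just an angle and a radius relative to an assumed pole and then sampled at fixed image rows; the pole location, parameter names, and line form are assumptions for illustration, not the exact definition used in Polar R-CNN.
\begin{verbatim}
# Illustrative polar parameterization of a straight lane anchor.
import numpy as np

def polar_anchor_to_xs(theta, r, ys, pole=(0.0, 0.0)):
    # Hesse normal form relative to the pole (cx, cy):
    #   (x - cx) * cos(theta) + (y - cy) * sin(theta) = r
    # Solve for x at each sampled row y (valid while cos(theta) != 0).
    cx, cy = pole
    return cx + (r - (ys - cy) * np.sin(theta)) / np.cos(theta)

ys = np.linspace(320.0, 160.0, 4)            # sampled image rows
print(polar_anchor_to_xs(np.deg2rad(10.0), 150.0, ys))
\end{verbatim}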
make.sh
@@ -1,7 +1,5 @@
# latexmk -c
# latexmk -pvc -xelatex -interaction=nonstopmode main.tex
latexmk -pdf main.tex
# continuously rebuild the PDF on file changes, without stopping on errors
latexmk -quiet -interaction=nonstopmode --pvc --pdf main.tex
# latexmk -pdf -interaction=nonstopmode -pvc main.tex
@@ -140,8 +140,8 @@
  year={2018}
}

@inproceedings{CondLaneNet,
  title={CondLaneNet: a top-to-down lane detection framework based on conditional convolution},
  author={Liu, Lizhe and Chen, Xiaohao and Zhu, Siyu and Tan, Ping},
  booktitle={Proceedings of the IEEE/CVF international conference on computer vision},
  pages={3773--3782},
@@ -265,7 +265,7 @@
  year={2021}
}

@inproceedings{learnNMS,
  title={Learning non-maximum suppression},
  author={Hosang, Jan and Benenson, Rodrigo and Schiele, Bernt},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},