This commit is contained in:
ShqWW 2024-09-27 16:31:24 +08:00
parent 03de77e908
commit 7f069ea240
8 changed files with 97 additions and 49 deletions

main.tex

Lane Detection, NMS-Free, Graph Neural Network, Polar Coordinate System.
\end{IEEEkeywords}
\section{Introduction}
\IEEEPARstart{L}{ane} detection is a critical task in computer vision and autonomous driving, aimed at identifying and tracking lane markings on the road \cite{adas}. While extensive research has been conducted in ideal environments, it is still challenging in adverse scenarios such as night driving, glare, crowded scenes, and rainy conditions, where lanes may be occluded or damaged \cite{scnn}. Moreover, the slender shapes and complex topologies of lanes further complicate detection efforts \cite{polylanenet}. %Therefore, an effective lane detection method should take into account both global high-level semantic features and local low-level features to address these varied conditions and ensure robust performances in a real-time application. along with their global properties,
\par
In the past few decades, most methods primarily focused on handcrafted local feature extraction and lane shape modeling. Techniques such as the \textit{Canny edge detector} \cite{cannyedge}, \textit{Hough transform} \cite{houghtransform}, and \textit{deformable templates} \cite{kluge1995deformable} have been widely employed for lane fitting. However, these approaches often face limitations in real-world scenarios, especially when low-level and local features lack clarity and distinctiveness.
\par
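To make the classical pipeline concrete, below is a minimal sketch using OpenCV's Canny edge detector and probabilistic Hough transform; all thresholds are illustrative and would need tuning per camera setup, and no deformable-template stage is included.
\begin{verbatim}
import cv2
import numpy as np

def classical_lane_detection(bgr_image):
    # Edge extraction followed by probabilistic Hough line fitting.
    gray = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blurred, 50, 150)  # hysteresis thresholds (illustrative)
    segments = cv2.HoughLinesP(edges, rho=1, theta=np.pi / 180, threshold=50,
                               minLineLength=40, maxLineGap=20)
    # Each segment is (x1, y1, x2, y2); an empty list means nothing was found.
    return [] if segments is None else [tuple(s[0]) for s in segments]
\end{verbatim}
Such a pipeline fails exactly where the text notes: when low-level edges are occluded or indistinct, no amount of threshold tuning recovers the lane.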
In recent years, advancements in deep learning and the availability of large datasets have led to significant progress in lane detection, especially with deep models such as \textit{Convolutional Neural Networks} (CNNs) \cite{scnn} and \textit{transformer-based} architectures \cite{lstr}. Building on these models, earlier approaches typically framed lane detection as a \textit{segmentation task} \cite{lanenet}, which, despite its straightforwardness, requires time-consuming computations. Other methods rely on \textit{parameter-based} models, which directly output lane curve parameters rather than pixel locations \cite{polylanenet}\cite{lstr}\cite{bezierlanenet}. Although these segmentation-based and parameter-based methods provide end-to-end solutions, their sensitivity to lane shape compromises their robustness.
\begin{figure}[t]
\centering
\def\subwidth{0.24\textwidth}
Drawing inspiration from object detection methods such as \textit{YOLO} \cite{yolov10} and \textit{Faster R-CNN} \cite{fasterrcnn}, several anchor-based approaches have been introduced for lane detection, with representative works including \textit{LaneATT} \cite{laneatt} and \textit{CLRNet} \cite{clrnet}. These methods have shown superior performance by leveraging anchor \textit{priors} (as shown in Fig. \ref{anchor setting}) and enabling larger receptive fields for feature extraction. However, anchor-based methods encounter similar drawbacks to those in general object detection, including the following:
\begin{itemize}
\item As shown in Fig. \ref{anchor setting}(a), a large number of lane anchors are predefined in the image, even in \textbf{\textit{sparse scenarios}}---situations where lanes are distributed widely and located far apart from each other, as illustrated in Fig. \ref{anchor setting}(d).
\item A \textit{Non-Maximum Suppression} (NMS) \cite{nms} post-processing step is required to eliminate redundant predictions, but it may struggle in \textbf{\textit{dense scenarios}} where lanes are close to each other, such as forked lanes and double lanes, as illustrated in Fig. \ref{NMS setting}(a).
\end{itemize}
\par
Regarding the first issue, \cite{clrnet} introduced learned anchors that optimize the anchor parameters during training to better adapt to lane distributions, as shown in Fig. \ref{anchor setting}(b). However, the number of anchors remains excessive to adequately cover the diverse potential distributions of lanes. Furthermore, \cite{adnet} proposes flexible anchors for each image by generating start points, rather than using a fixed set of anchors. Nevertheless, these start points of lanes are subjective and lack clear visual evidence due to the global nature of lanes. In contrast, \cite{srlane} uses a local angle map to propose sketch anchors according to the direction of the ground truth. While this approach considers directional alignment, it neglects precise anchor positioning, resulting in suboptimal performance. Overall, the abundance of anchors is unnecessary in sparse scenarios.% where lane ground truths are sparse. The trend in new methodologies is to reduce the number of anchors while offering more flexible anchor configurations.%, which negatively impacts its performance. They also employ cascade cross-layer anchor refinement to bring the anchors closer to the ground truth. in the absence of cascade anchor refinement
\par
Regarding the second issue, nearly all anchor-based methods \cite{laneatt}\cite{clrnet}\cite{adnet}\cite{srlane} rely on direct or indirect NMS post-processing to eliminate redundant predictions. Although it is necessary to eliminate redundant predictions, NMS remains a suboptimal solution. On one hand, NMS is not deployment-friendly because it requires defining and calculating distances between lane pairs using metrics such as \textit{Intersection over Union} (IoU). This task is more challenging than in general object detection due to the intricate geometry of lanes. On the other hand, NMS can struggle in dense scenarios. Typically, a large distance threshold may lead to false negatives, as some true positive predictions could be mistakenly eliminated, as illustrated in Fig. \ref{NMS setting}(a)(c). Conversely, a small distance threshold may fail to eliminate redundant predictions effectively, resulting in false positives, as shown in Fig. \ref{NMS setting}(b)(d). Therefore, achieving an optimal trade-off across all scenarios by manually setting the distance threshold is challenging. %The root of this problem lies in the fact that the distance definition in NMS considers only geometric parameters while ignoring the semantic context in the image. As a result, when two predictions are ``close'' to each other, it is nearly impossible to determine whether one of them is redundant.% where lane ground truths are closer together than in sparse scenarios;including those mentioned above,
\par
To address the above two issues, we propose Polar R-CNN, a novel anchor-based method for lane detection. For the first issue, we introduce local and global heads based on the polar coordinate system to create anchors with more accurate locations, thereby reducing the number of proposed anchors in sparse scenarios, as illustrated in Fig. \ref{anchor setting}(c). In contrast to \textit{State-Of-The-Art} (SOTA) methods \cite{clrnet}\cite{clrernet}, which utilize 192 anchors, Polar R-CNN employs only 20 anchors to effectively cover potential lane ground truths. For the second issue, we have revised Fast NMS \cite{yolact} to Graph-based Fast NMS, incorporating a new heuristic \textit{Graph Neural Network} (GNN) \cite{gnn} block (Polar GNN block) into the NMS head. The Polar GNN block offers an interpretable structure, achieving nearly equivalent performance in sparse scenarios and superior performance in dense scenarios. We conducted experiments on five major benchmarks: \textit{TuSimple} \cite{tusimple}, \textit{CULane} \cite{scnn}, \textit{LLAMAS} \cite{llamas}, \textit{CurveLanes} \cite{curvelanes}, and \textit{DL-Rail} \cite{dalnet}. Our proposed method demonstrates competitive performance compared to SOTA approaches. Our main contributions are summarized as follows:
\begin{itemize}
\item We design a strategy to simplify the anchor parameters by using local and global polar coordinate systems, and apply it to a two-stage lane detection framework. Compared to other anchor-based methods, this strategy significantly reduces the number of proposed anchors while achieving better performance.
\item We propose a novel Polar GNN block to implement an NMS-free paradigm. The block is inspired by Graph-based Fast NMS, providing enhanced interpretability. Our Polar GNN block supports end-to-end training and testing, while still allowing traditional NMS post-processing as an option for an NMS-based version of our model.
\begin{figure*}[ht]
\centering
\includegraphics[width=0.99\linewidth]{thesis_figure/ovarall_architecture.png}
\caption{An illustration of the Polar R-CNN architecture. It has a pipeline similar to that of Faster R-CNN for object detection, consisting of a backbone, a \textit{Feature Pyramid Network} with three levels of feature maps, denoted by $P_1, P_2, P_3$, followed by a \textit{Local Polar Module}, and a RoI pooling module that extracts features fed to a \textit{Global Polar Module} for lane detection. Based on the designed lane representation and lane anchor representation in the polar coordinate system, the local polar module proposes sparse line anchors and the global polar module produces robust and accurate lane predictions. The global polar module includes a triplet head, which comprises a \textit{one-to-one} (O2O) classification head, a \textit{one-to-many} (O2M) classification head, and a \textit{one-to-many} (O2M) regression head.}
\label{overall_architecture}
\end{figure*}
\section{Related Works}
\par
\textbf{Parameter-based Methods.} Instead of predicting a series of point locations or pixel classifications, the parameter-based methods directly generate the curve parameters of lane instances. For example, \textit{PolyLanenet} \cite{polylanenet} and \textit{LSTR} \cite{lstr} consider the lane instance as a polynomial curve, outputting the polynomial coefficients directly. \textit{BézierLaneNet} \cite{bezierlanenet} treats the lane instance as a Bézier curve, generating the locations of its control points, while \textit{BSLane} \cite{bsnet} uses B-splines to describe the lane, with curve parameters that emphasize local lane shapes. These parameter-based methods are mostly end-to-end and do not require post-processing, resulting in faster inference speed. However, since the final visual lane shapes are sensitive to the predicted curve parameters, the robustness and generalization of these methods may not be optimal.
\par
\textbf{Anchor-Based Methods.} These methods are inspired by general object detection models, such as YOLO \cite{yolov10} and Faster R-CNN \cite{fasterrcnn}. The earliest work is \textit{Line-CNN} \cite{linecnn}, which utilizes line anchors designed as rays emitted from three edges (left, bottom, and right) of an image. However, the model's receptive field is limited to the image edges, rendering it suboptimal for capturing the entirety of the lane. \textit{LaneATT} \cite{laneatt} improves upon this by employing anchor-based feature pooling to aggregate features along the entire line anchor, achieving faster speeds and better performance. Nevertheless, its grid sampling strategy and label assignment still pose limitations. A key advantage of anchor-based methods is their flexibility, allowing the integration of strategies from anchor-based object detection. For example, \textit{CLRNet} \cite{clrnet} enhances performance with \textit{cross-layer refinement strategies}, \textit{SimOTA label assignment} \cite{yolox}, and \textit{LIoU loss}, outperforming many previous methods. These methods also have some essential drawbacks, \textit{e.g.}, lane anchors are often handcrafted and numerous. Some approaches, such as \textit{ADNet} \cite{adnet}, \textit{SRLane} \cite{srlane}, and \textit{Sparse Laneformer} \cite{sparse}, attempt to reduce the number of anchors and provide more flexible proposals; however, this can slightly impact performance. Additionally, methods such as \cite{adnet}\cite{clrernet} still rely on NMS post-processing, complicating NMS threshold settings and model deployment. Although one-to-one label assignment during training, without NMS \cite{detr}\cite{o2o} during evaluation, alleviates this issue, its performance is still less satisfactory compared to NMS-based models.
\par
\textbf{NMS-free Methods.} Due to the threshold sensitivity and computational overhead of NMS, many studies explore NMS-free methods, \textit{i.e.}, models that do not use NMS during the detection process. For example, \textit{DETR} \cite{detr} employs one-to-one label assignment to avoid redundant predictions without using NMS. Other NMS-free methods \cite{yolov10}\cite{learnNMS}\cite{date} have also been proposed to address this issue from two aspects: \textit{model architecture} and \textit{label assignment}. For example, studies in \cite{yolov10}\cite{date} suggest that one-to-one assignments are crucial for NMS-free predictions, but maintaining one-to-many assignments is still necessary to ensure effective feature learning, while the works in \cite{o3d}\cite{relationnet} consider the model's expressive capacity to provide non-redundant predictions. However, compared to the extensive studies conducted in general object detection, there has been limited research analyzing the NMS-free paradigm in lane detection.
\par
In this work, we aim to address the above two issues within an anchor-based detection framework to achieve NMS-free and non-redundant lane predictions.
%
\includegraphics[width=\imgwidth]{thesis_figure/coord/polar.png}
\caption{}
\end{subfigure}
\caption{Different descriptions for anchor parameters: (a) Ray: defined by its start point (\textit{e.g.} the green point $\left( x_{1}^{s},y_{1}^{s} \right)$ or the yellow point $\left( x_{2}^{s},y_{2}^{s} \right) $) and direction $\theta$. (b) Polar: defined by its radius $r$ and angle $\theta$.} %rectangular coordinates
\label{coord}
\end{figure}
%
The overall architecture of our Polar R-CNN is illustrated in Fig. \ref{overall_architecture}. As shown in this figure, our Polar R-CNN for lane detection has a pipeline similar to that of Faster R-CNN \cite{fasterrcnn}: it consists of a backbone \cite{resnet}, a \textit{Feature Pyramid Network} (FPN) \cite{fpn}, a \textit{Region Proposal Network} (RPN) \cite{fasterrcnn} followed by a \textit{Local Polar Module} (LPM), and a \textit{Region of Interest} (RoI) pooling module \cite{fasterrcnn} followed by a \textit{Global Polar Module} (GPM). In the following, we first introduce the polar coordinate representation of lanes and lane anchors, and then present the designed LPM and GPM of our Polar R-CNN. %To investigate the fundamental factors affecting model performance, such as anchor settings and NMS post-processing, and also to enhance ease of deployment, our Polar R-CNN utilizes a simple and straightforward network structure. just relying on basic components, including convolutional or pooling operations, \textit{Multi-Layer Perceptrons} (MLPs), while deliberately excluding advanced elements like \textit{attention mechanisms}, \textit{dynamic kernels}, and \textit{cross-layer refinement} used in previous works \cite{clrnet}\cite{clrernet}.
%\par
%
%
Lanes are characterized by their thin, elongated, and curved shapes. A well-defined lane prior aids the model in feature extraction and location prediction.
\par
\textbf{Lane and Anchor Representation as Ray.} Given an input image with width $W$ and height $H$, a lane is represented by a set of 2D points $X=\{(x_1,y_1),(x_2,y_2),\cdots,(x_N,y_N)\}$ with equally spaced y-coordinates, i.e., $y_i=i\times\frac{H}{N}$, where $N$ is the number of data points. Since the y-coordinates are fixed, a lane can be uniquely defined by its x-coordinates. Previous studies \cite{linecnn}\cite{laneatt} have introduced \textit{lane priors}, also known as \textit{lane anchors}, which are represented as straight lines in the image plane and serve as references. From a geometric perspective, a lane anchor can be viewed as a ray defined by a start point $(x_{0},y_{0})$ located at the edge of an image (left/bottom/right boundaries), along with a direction $\theta$. The primary task of a lane detection model is to estimate the x-coordinate offset from the lane anchor to the ground truth of the lane instance.
\par
However, the representation of lane anchors as rays presents certain limitations. Notably, a lane anchor can have an infinite number of potential start points, which makes the definition of its start point ambiguous and subjective. As illustrated in Fig. \ref{coord}(a), the studies in \cite{dalnet}\cite{laneatt}\cite{linecnn} define the start points as being located at the boundaries of an image, such as the green point in Fig. \ref{coord}(a). In contrast, the research presented in \cite{adnet} defines the start points, exemplified by the purple point in Fig. \ref{coord}(a), based on their actual visual locations within the image. Moreover, occlusion and damage to the lane significantly affect the detection of these start points, highlighting the need for the model to have a large receptive field \cite{adnet}. Essentially, a straight lane has two degrees of freedom: the slope and the intercept, under a Cartesian coordinate system, implying that the lane anchor could be described using just two parameters instead of the three redundant parameters (\textit{i.e.}, two for the start point and one for orientation) employed in ray representation.
%
\begin{figure}[t]
\centering
\includegraphics[width=0.87\linewidth]{thesis_figure/coord/localpolar.png}
\caption{The local polar coordinate system. The ground truth of the radius $\hat{r}_{i}^{l}$ of the $i$-th local pole is defined as the minimum distance from the pole to the lane curve instance. A positive pole has a radius $\hat{r}_{i}^{l}$ below a threshold $\tau^{l}$, and vice versa. Additionally, the ground truth angle $\hat{\theta}_i$ is determined by the angle formed between the radius vector (connecting the pole to the closest point on the lane) and the local polar axis.}
\label{lpmlabel}
\end{figure}
\par
\textbf{Representation in Polar Coordinate.} As stated above, lane anchors represented by rays have some drawbacks. To address these issues, we introduce a polar coordinate representation of lane anchors. In mathematics, the polar coordinate system is a two-dimensional coordinate system in which each point on a plane is determined by a distance from a reference point (also called the pole) and an angle $\theta$ from a reference direction (called the polar axis). As shown in Fig. \ref{coord}(b), given a pole corresponding to the yellow point, a lane anchor for a straight line can be uniquely defined by two parameters: the radial distance from the pole (called the radius), $r$, and the counterclockwise angle from the polar axis to the perpendicular line of the lane anchor, $\theta$, with $r \in \mathbb{R}$ and $\theta\in\left(-\frac{\pi}{2}, \frac{\pi}{2}\right]$.
\par
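As a quick sanity check on this two-parameter representation, the sketch below (written under a generic angle convention; the paper's polar-axis orientation may differ) verifies that two distinct start points on the same line map to the same $(r, \theta)$ pair, removing the start-point ambiguity of the ray form.
\begin{verbatim}
import numpy as np

def ray_to_polar(start_point, direction_rad, pole):
    # Convert a ray-style anchor (start point + direction) into the
    # two polar parameters (r, theta) with respect to a given pole.
    d = np.array([np.cos(direction_rad), np.sin(direction_rad)])  # line direction
    n = np.array([-d[1], d[0]])                                   # unit normal
    r = float(np.dot(np.asarray(start_point, float) - np.asarray(pole, float), n))
    theta = float(np.arctan2(n[1], n[0]))  # angle of the perpendicular
    return r, theta

# Two different start points on the same 45-degree line, same pole:
print(ray_to_polar((0.0, 0.0), np.pi / 4, pole=(10.0, 0.0)))
print(ray_to_polar((5.0, 5.0), np.pi / 4, pole=(10.0, 0.0)))  # identical output
\end{verbatim}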
To better leverage the local inductive bias of CNNs, we define two types of polar coordinate systems: local and global. The local polar coordinate system is used to generate lane anchors, while the global coordinate system expresses these anchors in a unified form over the entire image and regresses them to the ground truth lane instances. Given the distinct roles of the local and global systems, we adopt a two-stage framework for our Polar R-CNN, similar to Faster R-CNN \cite{fasterrcnn}.
\par
The local polar system is designed to predict lane anchors adaptable to both sparse and dense scenarios. In this system, there are many poles, each located at a lattice point of the feature map, referred to as local poles. As illustrated on the left side of Fig. \ref{lpmlabel}, there are two types of local poles: positive and negative. Positive local poles (\textit{e.g.}, the blue points) have a radius $\hat{r}_{i}^{l}$ below a threshold $\tau^l$; otherwise, they are classified as negative local poles (\textit{e.g.}, the red points). Each local pole is responsible for predicting a single lane anchor, while a lane ground truth may generate multiple lane anchors; as shown in Fig. \ref{lpmlabel}, there are three positive poles around the lane instance (green lane), which are expected to generate three lane anchors. This one-to-many approach is essential for ensuring comprehensive anchor proposals, especially since some local features around certain poles may be lost due to damage or occlusion of the lane curve.
\par
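A minimal sketch of this pole labeling is given below; the distance to the discrete set of sampled lane points stands in for the exact point-to-curve distance, and the threshold value is hypothetical.
\begin{verbatim}
import numpy as np

def local_pole_labels(poles, lane_points, tau_l=8.0):
    # r_hat: distance from each pole to its nearest lane point;
    # theta_hat: angle of that radius vector; label: 1 if r_hat < tau_l.
    poles = np.asarray(poles, dtype=float)       # (M, 2)
    pts = np.asarray(lane_points, dtype=float)   # (N, 2)
    dists = np.linalg.norm(pts[None, :, :] - poles[:, None, :], axis=-1)
    nearest = dists.argmin(axis=1)               # closest lane point per pole
    r_hat = dists[np.arange(len(poles)), nearest]
    vec = pts[nearest] - poles
    theta_hat = np.arctan2(vec[:, 1], vec[:, 0])
    labels = (r_hat < tau_l).astype(np.float32)  # positive vs. negative poles
    return r_hat, theta_hat, labels
\end{verbatim}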
In the local polar coordinate system, the parameters of each lane anchor are determined based on the location of its corresponding local pole. In practical terms, however, once a lane anchor is generated, its position becomes fixed and independent of its original local pole. To simplify the representation of lane anchors in the second stage of Polar R-CNN, a global polar system is designed, featuring a single pole that serves as a reference point for the entire image. The location of this global pole is manually set; in this case, it is positioned near the static vanishing point observed across the entire lane image dataset. This approach ensures a consistent and unified framework for expressing lane anchors within the global context of the image, facilitating accurate regression to the ground truth lane instances.
\centering
\includegraphics[width=0.45\textwidth]{thesis_figure/local_polar_head.png}
\caption{The main architecture of the local polar module.}
\label{lpm}
\end{figure}
\subsection{Local Polar Module}
As shown in Fig. \ref{overall_architecture}, three levels of feature maps, denoted as $P_1, P_2, P_3$, are extracted using a \textit{Feature Pyramid Network} (FPN) \cite{fpn}. To generate high-quality anchors around the lane ground truths within an image, we introduce the \textit{Local Polar Module} (LPM), which takes the highest-level feature map $P_3\in\mathbb{R}^{C_{f} \times H_{f} \times W_{f}}$ as input and outputs a set of lane anchors along with their confidence scores. As demonstrated in Fig. \ref{lpm}, $P_3$ undergoes a \textit{downsampling} operation $DS(\cdot)$ to produce a lower-dimensional feature map of size $H^l\times W^l$:
\begin{equation}
F_d\gets DS\left( P_{3} \right)\ \text{and}\ F_d\in \mathbb{R} ^{C_f\times H^{l}\times W^{l}}.
\end{equation}
The downsampled feature map $F_d$ is then fed into two branches: a \textit{regression} branch $\phi _{reg}^{lpm}\left(\cdot \right)$ and a \textit{classification} branch $\phi _{cls}^{lpm}\left(\cdot \right)$, \textit{i.e.},
\begin{align}
F_{reg\,\,}\gets \phi _{reg}^{lpm}\left( F_d \right)\ &\text{and}\ F_{reg\,\,}\in \mathbb{R} ^{2\times H^{l}\times W^{l}},\\
F_{cls}\gets \phi _{cls}^{lpm}\left( F_d \right)\ &\text{and}\ F_{cls}\in \mathbb{R} ^{H^{l}\times W^{l}}. \label{lpm equ}
\end{align}
The regression branch consists of a single $1\times1$ convolutional layer that generates lane anchors by outputting their angles $\theta_j$ and radii $r^{l}_{j}$, \textit{i.e.}, $F_{reg\,\,} \equiv \left\{\theta_{j}, r^{l}_{j}\right\}_{j=1}^{H^{l}\times W^{l}}$, in the local polar coordinate system defined previously. Similarly, the classification branch $\phi _{cls}^{lpm}\left(\cdot \right)$ consists of only two $1\times1$ convolutional layers for simplicity. This branch predicts the confidence heat map $F_{cls\,\,}\equiv \left\{ s_j^l \right\} _{j=1}^{H^l\times W^l}$ for local poles, each associated with a feature point. By discarding local poles with lower confidence, the module increases the likelihood of selecting potential positive foreground lane anchors while effectively removing background lane anchors.
\par
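A PyTorch-style sketch of the two branches follows; the downsampling operator and the channel width are assumptions, as the text pins down only the $1\times1$ convolutional structure.
\begin{verbatim}
import torch.nn as nn

class LocalPolarModule(nn.Module):
    def __init__(self, c_f=64, down=4):
        super().__init__()
        self.down = nn.AvgPool2d(kernel_size=down, stride=down)  # DS(.)
        self.reg = nn.Conv2d(c_f, 2, kernel_size=1)  # (theta, r) per pole
        self.cls = nn.Sequential(                    # two 1x1 conv layers
            nn.Conv2d(c_f, c_f, kernel_size=1), nn.ReLU(inplace=True),
            nn.Conv2d(c_f, 1, kernel_size=1))

    def forward(self, p3):                  # p3: (B, C_f, H_f, W_f)
        f_d = self.down(p3)                 # (B, C_f, H_l, W_l)
        f_reg = self.reg(f_d)               # (B, 2, H_l, W_l)
        f_cls = self.cls(f_d).squeeze(1)    # (B, H_l, W_l) confidence logits
        return f_reg, f_cls
\end{verbatim}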
\textbf{Loss Function for Training the LPM.} To train the local polar module, we define the ground truth labels for each local pole as follows: the ground truth radius, $\hat{r}^l_i$, is set to be the minimum distance from a local pole to the corresponding lane curve, while the ground truth angle, $\hat{\theta}_i$, is set to be the orientation of the vector extending from the local pole to the nearest point on the curve. A positive pole is labeled as one; otherwise, it is labeled as zero. Consequently, we have a label set of local poles $F_{gt}=\{\hat{s}_j^l\}_{j=1}^{H^l\times W^l}$, where $\hat{s}_j^l=1$ if the $j$-th local pole is positive and $\hat{s}_j^l=0$ if it is negative. Once the regression and classification labels are established, as shown in Fig. \ref{lpmlabel}, the LPM can be trained using the \textit{smooth-L}1 loss $S_{L1}\left(\cdot \right)$ for the regression branch and the \textit{binary cross-entropy} loss $BCE\left( \cdot , \cdot \right)$ for the classification branch. The loss functions for the LPM are given as follows:
\begin{align}
\mathcal{L} ^{lpm}_{cls}&=BCE\left( F_{cls},F_{gt} \right), \\
\mathcal{L} ^{lpm}_{reg}&=\frac{1}{N^{lpm}_{pos}}\sum_{j\in \left\{j|\hat{r}_j^l<\tau^{l} \right\}}{\left( S_{L1}\left( \theta_j-\hat{\theta}_j \right) +S_{L1}\left( r_j^l-\hat{r}_j^l \right) \right)}, \label{loss_lph}
\end{align}
where $N^{lpm}_{pos}=\left|\{j|\hat{r}_j^l<\tau^{l}\}\right|$ is the number of positive local poles in the LPM.
\par
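Assuming the classification branch outputs logits, the two losses can be sketched directly from the equations above; the positive-pole mask implements the index set $\{j \mid \hat{r}_j^l<\tau^{l}\}$.
\begin{verbatim}
import torch
import torch.nn.functional as F

def lpm_loss(f_reg, f_cls, gt_theta, gt_r, gt_label, tau_l=8.0):
    # BCE over the whole confidence map; smooth-L1 over positive poles only.
    cls_loss = F.binary_cross_entropy_with_logits(f_cls, gt_label)
    pos = gt_r < tau_l                       # positive-pole mask
    n_pos = pos.sum().clamp(min=1)
    theta, r = f_reg[:, 0], f_reg[:, 1]
    reg_loss = (F.smooth_l1_loss(theta[pos], gt_theta[pos], reduction='sum')
                + F.smooth_l1_loss(r[pos], gt_r[pos], reduction='sum')) / n_pos
    return cls_loss, reg_loss
\end{verbatim}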
\textbf{Top-$K$ Anchor Selection.} As discussed above, all $H^{l}\times W^{l}$ anchors, each associated with a point in the feature map, are considered as candidate anchors during the training stage. This helps the second stage of our Polar R-CNN learn from a sufficient variety of features, including negative anchor samples. During the evaluation stage, however, only the top-$K$ anchors with the highest confidence scores $\{s_j^l\}$ are selected and fed into the next stage. This strategy effectively filters out potential negative anchors and reduces the computational complexity of the second stage. By doing so, it maintains the adaptability and flexibility of the anchor distribution while decreasing the total number of anchors. The following experiments will demonstrate the effectiveness of our top-$K$ anchor selection strategy.
%
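The selection itself is a single top-$K$ over the confidence map, sketched below with $K=20$ as used by Polar R-CNN.
\begin{verbatim}
import torch

def select_top_k_anchors(f_reg, f_cls, k=20):
    # Keep the K poles with the highest confidence and their (theta, r).
    scores = torch.sigmoid(f_cls).flatten(1)          # (B, H_l * W_l)
    topk_scores, idx = scores.topk(k, dim=1)
    params = f_reg.flatten(2).transpose(1, 2)         # (B, H_l * W_l, 2)
    topk_params = params.gather(1, idx.unsqueeze(-1).expand(-1, -1, 2))
    return topk_params, topk_scores
\end{verbatim}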
\begin{figure}[t]
\centering
\includegraphics[width=0.89\linewidth]{thesis_figure/detection_head.png} %
\caption{The main architecture of the GPM. It consists of the RoI pooling layer and the triplet head (\textit{i.e.}, the O2O classification head, the O2M classification head, and the O2M regression head). The predictions from the O2M classification head $\left\{s_i^g\right\}$ are redundant and require NMS post-processing. The O2O classification head serves as a replacement for NMS, directly outputting non-redundant predictions (denoted as $\left\{\tilde{s}_i^g\right\}$) based on the output scores from the O2M classification head.}
\label{gpm}
\end{figure}
\subsection{Global Polar Module}
Similar to the pipeline of Faster R-CNN, the LPM serves as the first stage, generating lane anchor proposals. As illustrated in Fig. \ref{overall_architecture}, we introduce a novel \textit{Global Polar Module} (GPM) as the second stage to achieve accurate lane prediction. The GPM takes features sampled along the anchors as input and outputs the precise locations and confidence scores of the final lane detection results. The overall architecture of the GPM is illustrated in Fig. \ref{gpm}.
\par
\textbf{RoI Pooling Layer.} It is designed to extract sampled features from lane anchors. For ease of the sampling operation, we first convert the radius of a positive lane anchor in the local polar coordinate system, $r_j^l$, to the one in the global polar coordinate system, $r_j^g$, by the following equation:
\begin{align}
r^{g}_{j}&=r^{l}_{j}+\left( \boldsymbol{c}^{l}_{j}-\boldsymbol{c}^{g} \right) ^{T}\left[\cos\theta_{j}; \sin\theta_{j} \right], \\
j&=1,2,\cdots,N^{lpm}_{pos},\notag
\end{align}
where $\boldsymbol{c}^{l}_{j} \in \mathbb{R}^{2}$ and $\boldsymbol{c}^{g} \in \mathbb{R}^{2}$ represent the Cartesian coordinates of the $j$-th local pole and the global pole, respectively. Note that we keep the angle $\theta_j$ unchanged, since the local and global polar coordinate systems share the same polar axis, as shown in Fig. \ref{lpmlabel}. Next, the feature points are sampled on each lane anchor by
\begin{align}
x_{i,j}&=-y_{i,j}\tan \theta_j +\frac{r^{g}_j}{\cos \theta_j},\label{positions}\\
i&=1,2,\cdots,N_p,\notag
\end{align}
where the y-coordinates $\{y_{1,j}, y_{2,j},\cdots,y_{N_p,j}\}$ of the $j$-th lane anchor are uniformly sampled vertically from the image, as previously mentioned.
\par
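Both the radius conversion and the sampling of Eq. (\ref{positions}) reduce to a few lines; the pole locations and the y-grid below are illustrative values only.
\begin{verbatim}
import numpy as np

def local_to_global_radius(r_local, theta, local_pole, global_pole):
    # Shift the radius to the global pole; theta is unchanged because the
    # two systems share the same polar axis.
    c_l = np.asarray(local_pole, float)
    c_g = np.asarray(global_pole, float)
    return r_local + (c_l - c_g) @ np.array([np.cos(theta), np.sin(theta)])

def sample_anchor_xs(r_global, theta, ys):
    # x = -y * tan(theta) + r / cos(theta) for each sampled y.
    return -ys * np.tan(theta) + r_global / np.cos(theta)

ys = np.linspace(0.0, 320.0, 36)  # uniformly sampled y-coordinates
r_g = local_to_global_radius(5.0, 0.3, local_pole=(40, 80), global_pole=(160, 40))
xs = sample_anchor_xs(r_g, 0.3, ys)
\end{verbatim}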
Given the feature maps $P_1, P_2, P_3$ from the FPN, we can extract the feature vectors corresponding to the positions of the feature points $\{(x_{1,j},y_{1,j}),(x_{2,j},y_{2,j}),\cdots,(x_{N_p,j},y_{N_p,j})\}_{j=1}^{N^{lpm}_{pos}}$, respectively denoted as $\boldsymbol{F}_{1}, \boldsymbol{F}_{2}, \boldsymbol{F}_{3}\in \mathbb{R} ^{N^{lpm}_{pos}\times C_f}$. To enhance the representation, similar to \cite{detr}, we employ a weighted-sum strategy to combine features from different levels:
\begin{equation}
\boldsymbol{F}^s=\sum_{k=1}^3{\boldsymbol{F}_{k}\otimes \frac{e^{\boldsymbol{w}_{k}}}{\sum_{k'=1}^3{e^{\boldsymbol{w}_{k'}}}}},
\end{equation}
where $\boldsymbol{w}_{k}\in \mathbb{R} ^{N^{lpm}_{pos}}$ denotes the learnable aggregation weights. Instead of directly concatenating the three sampled features into $\boldsymbol{F}^s\in \mathbb{R} ^{N_p\times C_f\times 3}$, the adaptive summation significantly reduces the feature dimension to $\boldsymbol{F}^s\in \mathbb{R} ^{N_p\times C_f}$, one-third of the original. The weighted-sum tensors are then fed into fully connected layers to obtain the pooled RoI features of an anchor:
\begin{equation}
\begin{aligned}
\boldsymbol{F}^{roi}\gets FC^{pooling}\left( \boldsymbol{F}^s \right), \boldsymbol{F}^{roi}\in \mathbb{R} ^{d_r}.
\end{aligned}
\end{equation}
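A sketch of the adaptive summation and pooling is given below. The notation does not pin down whether the softmax weights are shared per sample point or per anchor, so this sketch assumes per-point weights; $N_p$ and $d_r$ are assumed sizes.
\begin{verbatim}
import torch
import torch.nn as nn

class WeightedRoIPooling(nn.Module):
    def __init__(self, n_p=36, c_f=64, d_r=192):
        super().__init__()
        self.w = nn.Parameter(torch.zeros(3, n_p))  # learnable level weights
        self.fc = nn.Linear(n_p * c_f, d_r)         # FC^{pooling}

    def forward(self, f1, f2, f3):                  # each: (B, n_p, c_f)
        feats = torch.stack([f1, f2, f3], dim=0)    # (3, B, n_p, c_f)
        alpha = torch.softmax(self.w, dim=0)        # normalize across levels
        fused = (alpha[:, None, :, None] * feats).sum(dim=0)  # (B, n_p, c_f)
        return self.fc(fused.flatten(1))            # (B, d_r)
\end{verbatim}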
\textbf{Triplet Head.} The triplet head comprises three distinct heads: the one-to-one classification (O2O cls) head, the one-to-many classification (O2M cls) head, and the one-to-many regression (O2M reg) head. In various studies \cite{laneatt}\cite{clrnet}\cite{adnet}\cite{srlane}, the detection head predominantly follows the one-to-many paradigm. During the training phase, multiple positive samples are assigned to a single ground truth. Consequently, during the evaluation stage, redundant detection results are often predicted for each instance. These redundancies are typically addressed using NMS, which eliminates duplicate results and retains the highest-confidence detection for each ground truth. However, NMS relies on the definition of distance between detection results, and this calculation can be complex for curved lanes and other irregular geometric shapes. To achieve non-redundant detection results with an NMS-free paradigm, the one-to-one paradigm becomes crucial during training, as highlighted in \cite{o2o}. Nevertheless, merely adopting the one-to-one paradigm is insufficient; the structure of the detection head also plays a pivotal role in achieving NMS-free detection. This aspect will be further analyzed in the following sections.
%
%
%
%
\par
\textbf{NMS vs NMS-free.} Let $\boldsymbol{F}^{roi}_{i}$ denote the RoI features extracted from the $i$-th anchor; the three subheads take $\boldsymbol{F}^{roi}_{i}$ as input. For now, let us focus on the O2M classification (O2M cls) head and the O2M regression (O2M reg) head, which follow the paradigm used in previous work and can serve as a baseline for the new one-to-one paradigm. To maintain simplicity and rigor, both the O2M classification head and the O2M regression head consist of two layers with activation functions, featuring a plain structure without any complex mechanisms such as attention or deformable convolution. As previously mentioned, merely replacing the one-to-many label assignment with one-to-one label assignment is insufficient for eliminating NMS post-processing. This is because anchors often exhibit significant overlap or are positioned very close to each other, as shown in Fig. \ref{anchor setting}(b)\&(c). Let $\boldsymbol{F}^{roi}_{i}$ and $\boldsymbol{F}^{roi}_{j}$ represent the features from two overlapping (or very close) anchors, implying that $\boldsymbol{F}^{roi}_{i}$ and $\boldsymbol{F}^{roi}_{j}$ will be almost identical. Let $f_{cls}^{plain}$ denote the neural structure used in the O2M classification head and suppose it is trained with one-to-one label assignment. If $\boldsymbol{F}^{roi}_{i}$ is a positive sample and $\boldsymbol{F}^{roi}_{j}$ is a negative sample, the ideal output should be as follows:
\begin{equation}
\begin{aligned}
&\boldsymbol{F}_{i}^{roi}\approx \boldsymbol{F}_{j}^{roi},\\
&f_{cls}^{plain}\left( \boldsymbol{F}_{i}^{roi} \right) \rightarrow 1,\,\, f_{cls}^{plain}\left( \boldsymbol{F}_{j}^{roi} \right) \rightarrow 0,
\end{aligned}
\label{sharp fun}
\end{equation}
Eq. (\ref{sharp fun}) suggests that $f_{cls}^{plain}$ needs to be ``sharp'' enough to differentiate between two similar features. That is, since the output of $f_{cls}^{plain}$ must change rapidly over short distances in the feature space, $f_{cls}^{plain}$ needs to capture higher-frequency information. This issue is also discussed in \cite{o3d}. Capturing high frequencies with a plain structure is difficult because a naive MLP tends to capture lower-frequency information \cite{xu2022overview}. In the most extreme case, where $\boldsymbol{F}_{i}^{roi} = \boldsymbol{F}_{j}^{roi}$, it becomes impossible to completely distinguish the two anchors as positive and negative samples; in practice, both confidences converge to around 0.5. This problem arises from the limitations of the input format and the structure of the naive MLP, which restrict its expressive capability for higher-frequency information. Therefore, it is crucial to establish relationships between anchors and design a new model structure to effectively represent ``sharp'' information.
It is easy to see that the ``ideal'' one-to-one branch is equivalent to the O2M cls branch with O2M regression and NMS post-processing. If the NMS could be replaced by an equivalent but learnable function (\textit{e.g.}, a neural network with a specific structure), the O2O head could be trained to handle the one-to-one assignment. However, NMS involves sequential iteration and confidence sorting, which are challenging to reproduce with a neural network. Although previous works, such as RNN-based approaches \cite{stewart2016end}, utilize an iterative format, they are time-consuming and introduce additional complexity into the model training process due to their iterative nature. To eliminate the iteration process, we propose an equivalent form of Fast NMS \cite{yolact}.
\begin{algorithm}[t]
Given a series of positive detections with redundancy, a detection result A is suppressed by another detection result B if and only if the following three conditions hold (a matrix-form sketch is given after the list):
(1) The confidence of A is lower than that of B.
(2) The predefined distance (\textit{e.g.} IoU distance and L1 distance) between A and B is smaller than a threshold.
(3) B is not suppressed by any other detection results.
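Dropping condition (3) is precisely the relaxation Fast NMS makes, and it collapses the three conditions into one matrix test, sketched below; the mean per-row L1 gap is a stand-in for the true lane distance, and it is this fixed test that the Polar GNN block replaces with learned functions.
\begin{verbatim}
import torch

def fast_nms_lanes(scores, lane_xs, dist_thresh=15.0):
    # lane_xs: (N, n_pts) x-coordinates of each lane at shared y positions.
    order = scores.argsort(descending=True)      # condition (1) via ranking
    xs = lane_xs[order]
    dist = (xs[:, None, :] - xs[None, :, :]).abs().mean(-1)  # pairwise L1
    idx = torch.arange(len(order))
    outranks = idx[:, None] < idx[None, :]       # entry (i, j): i outranks j
    adj = (dist < dist_thresh) & outranks        # condition (2) edges
    suppressed = adj.any(dim=0)                  # j loses to any close i above it
    return order[~suppressed]                    # kept lane indices
\end{verbatim}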
Eq. (\ref{edge_layer}) represents the implicit expression of Eq. (\ref{al_1-3}).
Eq. (\ref{node_layer}) serves as the implicit replacement for Eq. (\ref{al_1-4}). In this approach, we use elementwise max pooling of tensors instead of scalar-based max operations. The pooled tensor is then fed into a neural network with a sigmoid activation function to directly obtain the confidence. By eliminating the need for a predefined distance threshold, all confidence calculation patterns are derived from the training data.
It should be noted that the O2O classification head depends on the predictions of the O2M classification head, as outlined in Eq. (\ref{al_1-1}). From a probability perspective, the confidence output by the O2M classification head, $s_{j}$, represents the probability that the $j$-th detection is a positive sample. The confidence output by the O2O classification head, $\tilde{s}_i$, denotes the conditional probability that the $i$-th sample should not be suppressed, given that the $i$-th sample is identified as a positive sample:
\begin{equation}
\begin{aligned}
&s_j|_{j=1}^{N_a}\equiv P\left( a_j\,\,is\,\,pos \right), \,\,
\tilde{s}_i|_{i=1}^{N_a}\equiv P\left( a_i\,\,not\,\,suppressed\,|\,a_i\,\,is\,\,pos \right).
\end{aligned}
\end{equation}
\begin{equation}
\begin{aligned}
GLaneIoU\,\,=\,\,\frac{\sum\nolimits_{i=j}^k{d_{i}^{\mathcal{O}}}}{\sum\nolimits_{i=j}^k{d_{i}^{\mathcal{U}}}},
\end{aligned}
\end{equation}
where $j$ and $k$ are the indices of the valid points (the start point and the end point). It is straightforward to observe that when $g=0$, GLaneIoU corresponds to the IoU for bounding boxes, with a value range of $\left[0, 1 \right]$. When $g=1$, GLaneIoU corresponds to the GIoU \cite{giouloss} for bounding boxes, with a value range of $\left(-1, 1 \right]$. In general, when $g>0$, the value range of GLaneIoU is $\left(-g, 1 \right]$.
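For intuition, a LineIoU-style reading of the $g=0$ case is sketched below, assuming $d_i^{\mathcal{O}}$ and $d_i^{\mathcal{U}}$ are per-row overlap and union widths of lanes widened laterally by a half-width $w$; the exact widened-boundary definitions for $g>0$ follow the equation above.
\begin{verbatim}
import numpy as np

def lane_iou_g0(xs_a, xs_b, width=7.5):
    # xs_a, xs_b: x-coordinates of two lanes on their commonly valid rows
    # (indices j..k in the text). Overlap is clipped at zero, so the
    # result lies in [0, 1], matching the stated range for g = 0.
    left_a, right_a = xs_a - width, xs_a + width
    left_b, right_b = xs_b - width, xs_b + width
    overlap = np.minimum(right_a, right_b) - np.maximum(left_a, left_b)
    union = np.maximum(right_a, right_b) - np.minimum(left_a, left_b)
    return np.clip(overlap, 0.0, None).sum() / union.sum()
\end{verbatim}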
We then define the cost function between the $i$-th prediction and the $j$-th ground truth, following \cite{detr}:
\begin{equation}
\begin{aligned}
\mathcal{C} _{ij}=\left(s_i\right)^{\beta_c}\times \left( GLaneIoU_{ij, g=0} \right) ^{\beta_r}.
\end{aligned}
\end{equation}
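Since the cost drives a one-to-one assignment in the style of \cite{detr}, a Hungarian-matching sketch is natural; the $\beta$ values below are hypothetical, and the cost is negated because SciPy's solver minimizes.
\begin{verbatim}
import numpy as np
from scipy.optimize import linear_sum_assignment

def one_to_one_assign(scores, glane_iou, beta_c=1.0, beta_r=6.0):
    # scores: (P,) O2M confidences; glane_iou: (P, G) with g = 0, in [0, 1].
    cost = (scores[:, None] ** beta_c) * (np.clip(glane_iou, 0.0, 1.0) ** beta_r)
    pred_idx, gt_idx = linear_sum_assignment(-cost)  # maximize total cost
    return pred_idx, gt_idx
\end{verbatim}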
The overall loss function of Polar R-CNN is given as follows:
\begin{equation}
\begin{aligned}
\mathcal{L}_{overall} &=\mathcal{L} _{lpm}^{cls}+w_{lpm}^{reg}\mathcal{L} _{lpm}^{reg}\\&+w_{o2m}^{cls}\mathcal{L} _{o2m}^{cls}+w_{o2o}^{cls}\mathcal{L} _{o2o}^{cls}+w_{rank}\mathcal{L} _{rank}\\&+w_{IoU}\mathcal{L} _{IoU}+w_{end}\mathcal{L} _{end}+w_{aux}\mathcal{L} _{aux}.
\end{aligned}
\end{equation}
The first line in the loss function represents the loss for the LPM, which includes both classification and regression components. The second line pertains to the losses associated with the two classification heads (O2O and O2M), while the third line represents the losses for the regression head within the triplet head. Each term is weighted by a factor to balance the contributions of each component to the gradient. The entire training process is end-to-end.
and image processing.
\end{IEEEbiography}
\vfill
\newpage
% When the appendix contains multiple sections
\appendices
\section{Title of the 1st appendix}
This is the first paragraph of Appx. A ...
\section{Title of the 2nd appendix}
This is the first paragraph of Appx. B ...
\end{document}

@article{adas,
title={Recent progress in road and lane detection: a survey},
author={Bar Hillel, Aharon and Lerner, Ronen and Levi, Dan and Raz, Guy},
journal={Machine vision and applications},
volume={25},
number={3},
pages={727--745},
year={2014},
publisher={Springer}
}
@article{gnn,
title={A comprehensive survey on graph neural networks},
author={Wu, Zonghan and Pan, Shirui and Chen, Fengwen and Long, Guodong and Zhang, Chengqi and Yu, Philip S.},
journal={IEEE transactions on neural networks and learning systems},
volume={32},
number={1},
pages={4--24},
year={2020},
publisher={IEEE}
}
@inproceedings{nms,
title={Efficient non-maximum suppression},
author={Neubeck, Alexander and Van Gool, Luc},
booktitle={18th international conference on pattern recognition (ICPR'06)},
volume={3},
pages={850--855},
year={2006},
organization={IEEE}
}
@inproceedings{fpn,
title={Feature pyramid networks for object detection},
author={Lin, Tsung-Yi and Doll{\'a}r, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
pages={2117--2125},
year={2017}
}
