update
parent a2a4f997a8
commit 1043faa952

main.tex: 102 changed lines
@@ -21,7 +21,7 @@
 \usepackage[colorlinks,bookmarksopen,bookmarksnumbered, linkcolor=red]{hyperref}
 % \usepackage[table,xcdraw]{xcolor}

-\definecolor{darkgreen}{RGB}{17,159,27} % or define the dark green with other RGB values
+\definecolor{darkgreen}{RGB}{17,159,27} %
 \aboverulesep=0pt
 \belowrulesep=0pt
 \hyphenation{op-tical net-works semi-conduc-tor IEEE-Xpolare}
@@ -31,14 +31,20 @@

 \title{PolarRCNN:\@ End-to-End Lane Detection with Fewer Anchors}

-\author{IEEE Publication Technology,~\IEEEmembership{Staff,~IEEE,}
-% <-this % stops a space
-\thanks{This work was supported in part by the National Natural Science Foundation of China under Grant 62276208 and 12326607, and in part by the Natural Science Basic Research Program of Shaanxi Province 2024]C-JCQN-02.}% <-this % stops a space
-\thanks{Manuscript received April 19, 2021; revised August 16, 2021.}}
+\author{Shengqi Wang and Junmin Liu\\
+\thanks{This work was supported in part by the National Natural Science Foundation of China (Grant Nos. 62276208 and 12326607) and in part by the Natural Science Basic Research Program of Shaanxi Province (Grant No. 2024JC-JCQN-02).}%
+\thanks{S. Wang is with the School of Mathematics and Statistics, Xi'an Jiaotong University, Xi'an 710049, China, and also with the School of Mathematics and Statistics, The University of Melbourne, VIC 3010, Australia.}
+\thanks{J. Liu is with the School of Mathematics and Statistics, Xi'an Jiaotong University, Xi'an 710049, China.}
+}

+%\thanks{Manuscript received April 19, 2021; revised August 16, 2021.}}

 % The paper headers
-\markboth{Journal of \LaTeX\ Class Files,~Vol.~14, No.~8, August~2021}%
-{Shell \MakeLowercase{\textit{et al.}}: A Sample Article Using IEEEtran.cls for IEEE Journals}
+\markboth{S. Wang \MakeLowercase{\textit{et al.}}: PolarRCNN:\@ End-to-End Lane Detection with Fewer Anchors}%
+{S. Wang \MakeLowercase{\textit{et al.}}: PolarRCNN:\@ End-to-End Lane Detection with Fewer Anchors}

 % \IEEEpubid{0000--0000/00\$00.00~\copyright~2021 IEEE}
 % Remember, if you use this you must call \IEEEpubidadjcol in the second
@@ -163,7 +169,7 @@ The lane detection aims to detect lane instances in a image. In this section, we

 In this work, we aim to address two issues in anchor-based lane detection mentioned above: the sparse lane anchor setting and NMS-free predictions.

-\section{Method}
+\section{Proposed Method}
 The overall architecture of PolarRCNN is illustrated in Fig. \ref{overall_architecture}. Our model adheres to the Faster R-CNN \cite{fasterrcnn} framework, consisting of a backbone, an FPN (Feature Pyramid Network), an RPN (Region Proposal Network), and RoI (Region of Interest) pooling. To investigate the fundamental factors affecting model performance, such as anchor settings and NMS (Non-Maximum Suppression) postprocessing, and to make the model easier to deploy, PolarRCNN employs a simple and straightforward network structure. It relies on basic components, including convolutional layers, MLPs (Multi-Layer Perceptrons), and pooling operations, deliberately excluding advanced elements such as attention mechanisms, dynamic kernels, and cross-layer refinement used in previous works \cite{clrnet}\cite{clrernet}.

 \begin{table}[h]
@@ -276,7 +282,7 @@ Global polar head (GPH) is a crucial component in the second stage of PolarRCNN.
 \textbf{RoI Pooling Module.} The RoI pooling module is designed to transform features sampled from lane anchors into a standard feature tensor. Once the local polar parameters of a lane anchor are given, they can be converted to global polar coordinates using the following equation:
 \begin{equation}
 \begin{aligned}
-r^{G}_{j}=r^{L}_{j}+\left( \textbf{c}^{L}_{j}-\textbf{c}^{G}_{j} \right) \left[\cos\theta_{j}, \sin\theta_{j} \right]^{T}
+r^{G}_{j}=r^{L}_{j}+\left( \textbf{c}^{L}_{j}-\textbf{c}^{G}_{j} \right) ^{T}\left[\cos\theta_{j}, \sin\theta_{j} \right]^{T}
 \end{aligned}
 \end{equation}
 where $\textbf{c}^{L}_{j} \in \mathbb{R}^{2}$ and $\textbf{c}^{G} \in \mathbb{R}^{2}$ represent the Cartesian coordinates of the local and global origins, respectively.
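In effect, the conversion above shifts the radius by the projection of the offset between the two origins onto the unit direction of the anchor angle. A minimal sketch in plain Python; the function name and tuple-based coordinates are illustrative, not the released code:

import math

def local_to_global_radius(r_local, c_local, c_global, theta):
    # r_G = r_L + (c_L - c_G)^T [cos(theta), sin(theta)]^T:
    # the radius shifts by the projection of the origin offset onto
    # the unit vector of angle theta.
    dx = c_local[0] - c_global[0]
    dy = c_local[1] - c_global[1]
    return r_local + dx * math.cos(theta) + dy * math.sin(theta)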
@@ -304,10 +310,9 @@ Suppose the $P_{0}$, $P_{1}$ and $P_{2}$ denote the last three levels from FPN a
 \end{aligned}
 \end{equation}
 where $\boldsymbol{w}_{L}^{s}\in \mathbb{R} ^{N_p}$ denotes the learnable aggregation weight. Instead of directly concatenating the three sampled features into $\boldsymbol{F}^s\in \mathbb{R} ^{N_p\times d_f\times 3}$, the adaptive summation significantly reduces the feature dimension to $\boldsymbol{F}^s\in \mathbb{R} ^{N_p\times d_f}$, one-third of the original. The weighted-sum tensor is then fed into fully connected layers to obtain the pooled RoI features of an anchor:

 \begin{equation}
 \begin{aligned}
-\boldsymbol{F}^{roi}\gets FC^{pooling}\left( \boldsymbol{F}^s \right) , \boldsymbol{F}^{roi}\in \mathbb{R} ^{d_r}
+\boldsymbol{F}^{roi}\gets FC^{pooling}\left( \boldsymbol{F}^s \right), \boldsymbol{F}^{roi}\in \mathbb{R} ^{d_r}
 \end{aligned}
 \end{equation}

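A minimal sketch of this adaptive summation followed by the pooling FC (PyTorch-style; the tensor shapes, class name, and per-level weight layout are assumptions, not the released implementation):

import torch
import torch.nn as nn

class WeightedSumRoIPooling(nn.Module):
    # Fuses features sampled from three FPN levels with a learnable
    # weight instead of concatenation, so the fused tensor keeps
    # one-third of the concatenated size, then maps it to F^roi.
    def __init__(self, n_points: int, d_f: int, d_r: int):
        super().__init__()
        self.w = nn.Parameter(torch.full((n_points, 1, 3), 1.0 / 3))  # w^s
        self.fc = nn.Linear(n_points * d_f, d_r)                      # FC^pooling

    def forward(self, feats: torch.Tensor) -> torch.Tensor:
        # feats: (n_points, d_f, 3), one slice per FPN level
        fused = (feats * self.w).sum(dim=-1)   # (n_points, d_f)
        return self.fc(fused.flatten())        # (d_r,)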
@@ -375,8 +380,6 @@ where $\boldsymbol{w}_{L}^{s}\in \mathbb{R} ^{N_p}$ represents the learnable agg
 \end{figure}

 \textbf{NMS vs NMS-free.} Let $\boldsymbol{F}^{roi}_{i}$ denote the RoI features extracted from the $i$-th anchor; the three subheads take $\boldsymbol{F}^{roi}_{i}$ as input. For now, let us focus on the O2M classification (O2M cls) head and the O2M regression (O2M Reg) head, which follow the old paradigm used in previous work and can serve as a baseline for the new one-to-one paradigm. To maintain simplicity and rigor, both the O2M cls head and the O2M Reg head consist of two layers with activation functions, featuring a plain structure without any complex mechanisms such as attention or deformable convolution. As previously mentioned, merely replacing the one-to-many label assignment with one-to-one label assignment is insufficient for eliminating NMS postprocessing. This is because anchors often exhibit significant overlap or are positioned very close to each other, as shown in Fig. \ref{anchor setting} (b)(c). Let $\boldsymbol{F}^{roi}_{i}$ and $\boldsymbol{F}^{roi}_{j}$ represent the features from two overlapping (or very close) anchors, implying that $\boldsymbol{F}^{roi}_{i}$ and $\boldsymbol{F}^{roi}_{j}$ will be almost identical. Let $f_{plain}^{cls}$ denote the neural structure used in the O2M cls head but trained with one-to-one label assignment. If $\boldsymbol{F}^{roi}_{i}$ is a positive sample and $\boldsymbol{F}^{roi}_{j}$ is a negative sample, the ideal output should be as follows:
-
-
 \begin{equation}
 \begin{aligned}
 &\boldsymbol{F}_{i}^{roi}\approx \boldsymbol{F}_{j}^{roi}
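The failure mode can be checked numerically: a plain head is a continuous map, so nearly identical RoI features always yield nearly identical confidences, regardless of how labels were assigned during training. A toy check (PyTorch; the feature dimension of 64 is made up):

import torch
import torch.nn as nn

torch.manual_seed(0)
# A plain two-layer classification head, as described above.
f_plain_cls = nn.Sequential(
    nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 1), nn.Sigmoid()
)

F_i = torch.randn(64)               # RoI feature of one anchor
F_j = F_i + 1e-4 * torch.randn(64)  # near-duplicate overlapping anchor
s_i, s_j = f_plain_cls(F_i), f_plain_cls(F_j)
# s_i and s_j are almost equal, so the head cannot assign ~1 to one
# anchor and ~0 to the other, which is what NMS-free prediction needs.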
@@ -440,8 +443,7 @@ Equation \ref{edge_layer} represents the implicit expression of equation \ref{al

 Equation \ref{node_layer} serves as an implicit replacement for Equation \ref{al_1-4}. In this approach, we use elementwise max pooling of tensors instead of scalar-based max operations. The pooled tensor is then fed into a neural network with a sigmoid activation to obtain the confidence directly. By eliminating the need for a predefined distance threshold, all confidence calculation patterns are learned from the training data.

-It should be noted that the O2O cls head depends on the predictons of O2M cls head as outlined in equation \ref{al_1-1}. From a probablity percpective, the confidence output by O2M cls head, $s_{j}$
-,represents the probability that the $j_{th}$ detection is a positive sample. The confidence output by O2O cls head, $\tilde{s}_i$, denotes the conditional probablity that $i_{th}$ sample shouldn't be supressed given the condition that the $i_{th}$ sample identified as a positive sample:
+It should be noted that the O2O cls head depends on the predictions of the O2M cls head, as outlined in Equation \ref{al_1-1}. From a probability perspective, the confidence output by the O2M cls head, $s_{j}$, represents the probability that the $j$-th detection is a positive sample. The confidence output by the O2O cls head, $\tilde{s}_i$, denotes the conditional probability that the $i$-th sample should not be suppressed, given that it is identified as a positive sample:
 \begin{equation}
 \begin{aligned}
 &s_j|_{j=1}^{N_A}\equiv P\left( a_j\,\,is\,\,pos \right) \,\,
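Under this reading, the natural test-time confidence in the NMS-free path is the joint probability that a sample is positive and survives suppression, i.e. the product of the two heads' outputs. A one-line sketch (one possible reading of the probabilistic decomposition, not confirmed code):

import torch

def nms_free_confidence(s_o2m: torch.Tensor, s_o2o: torch.Tensor) -> torch.Tensor:
    # s_o2m[j] = P(a_j is positive); s_o2o[j] = P(a_j kept | a_j positive),
    # so the joint probability of a kept positive is their product.
    return s_o2m * s_o2o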
@@ -576,14 +578,12 @@ The first line in the loss function represents the loss for the local polar head

 \begin{figure}[t]
 \centering
-\includegraphics[width=\linewidth]{thsis_figure/auxloss.png} % replace with your image file name
+\includegraphics[width=\linewidth]{thsis_figure/auxloss.png} %
 \caption{Auxloss for segment parameter regression.}
 \label{auxloss}
 \end{figure}

 \section{Experiment}
-
-
 \subsection{Dataset and Evaluation Metric}
 We conducted experiments on four widely used lane detection benchmarks and one rail detection dataset: CULane\cite{scnn}, TuSimple\cite{tusimple}, LLAMAS\cite{llamas}, CurveLanes\cite{curvelanes}, and DL-Rail\cite{dalnet}. Among these datasets, CULane and CurveLanes are particularly challenging. The CULane dataset consists of various scenarios but has sparse lane distributions, whereas CurveLanes includes a large number of curved and dense lane types, such as forked and double lanes. The DL-Rail dataset, focused on rail detection across different scenarios, was chosen to evaluate our model's performance beyond traditional lane detection. The details of the five datasets are shown in Table \ref{dataset_info}.

@@ -1137,13 +1137,13 @@ In the traditional NMS post-processing \cite{clrernet}, the default IoU threshol
 \end{subfigure}
 \vspace{0.5em}

-\caption{The Visualization of the detection results of sparse scenarios.}
+\caption{Visualization of the detection results in sparse scenarios.}
 \label{vis_sparse}
 \end{figure*}


-\begin{figure*}[htbp]
+\begin{figure*}[htbp!]
 \centering
 \def\subwidth{0.24\textwidth}
 \def\imgwidth{\linewidth}
@@ -1238,63 +1238,27 @@ In the traditional NMS post-processing \cite{clrernet}, the default IoU threshol
 \end{subfigure}
 \vspace{0.5em}

-\caption{The Visualization of the detection results of sparse scenarios.}
+\caption{Visualization of the detection results in dense scenarios.}
 \label{vis_dense}
 \end{figure*}

-\section{Conclusion}
-In this paper, we propose PolarRCNN to address two key issues in anchor-based lane detection methods. By incorporating a local and global polar coordinate system, our model achieves improved performance with fewer anchors. Additionally, the introduction of a GNN-based O2O classification head allows us to replace the traditional NMS post-processing, and the NMS-free paradigm demonstrates superior performance in dense scenarios. Our model is highly flexible; the number of anchors can be adjusted based on the specific scenario. Users have the option to use either the O2M classification head with NMS post-processing or the O2O classification head for an NMS-free approach. PolarRCNN is also deployment-friendly due to its simple structure, making it a potential new baseline for lane detection. Future work could explore incorporating new structures, such as large kernels or attention mechanisms, and experimenting with new label assignment, training, and anchor sampling strategies. We also plan to extend PolarRCNN to video instance lane detection and 3D lane detection, utilizing advanced geometric modeling for these new tasks.
+\section{Conclusion and Future Work}
+In this paper, we propose PolarRCNN to address two key issues in anchor-based lane detection methods. By incorporating a local and global polar coordinate system, our PolarRCNN achieves improved performance with fewer anchors. Additionally, the introduction of a GNN-based O2O classification head allows us to replace the traditional NMS post-processing, and the NMS-free paradigm demonstrates superior performance in dense scenarios. Our model is highly flexible, and the number of anchors can be adjusted based on the specific scenario. Users have the option to use either the O2M classification head with NMS post-processing or the O2O classification head for an NMS-free approach. PolarRCNN is also deployment-friendly due to its simple structure, making it a potential new baseline for lane detection. Future work could explore incorporating new structures, such as large kernels or attention mechanisms, and experimenting with new label assignment, training, and anchor sampling strategies. We also plan to extend PolarRCNN to video instance lane detection and 3D lane detection, utilizing advanced geometric modeling for these new tasks.
+%
+%
+%
-% \section*{Acknowledgments}
-% This should be a simple paragraph before the References to thank those individuals and institutions who have supported your work on this article.

-%{\appendices
-%\section*{Proof of the First Zonklar Equation}
-%Appendix one text goes here.
-% You can choose not to have a title for an appendix if you want by leaving the argument blank
-%\section*{Proof of the Second Zonklar Equation}
-%Appendix two text goes here.}

 \bibliographystyle{IEEEtran}
 \bibliography{reference}

-\newpage
-\section{Biography Section}
-% If you have an EPS/PDF photo (graphicx package needed), extra braces are
-% needed around the contents of the optional argument to biography to prevent
-% the LaTeX parser from getting confused when it sees the complicated
-% $\backslash${\tt{includegraphics}} command within an optional argument. (You can create
-% your own custom macro containing the $\backslash${\tt{includegraphics}} command to make things
-% simpler here.)
-% \vspace{11pt}
-% \bf{If you include a photo:}\vspace{-33pt}
-% \begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{fig1}}]{Michael Shell}
-% Use $\backslash${\tt{begin\{IEEEbiography\}}} and then for the 1st argument use $\backslash${\tt{includegraphics}} to declare and link the author photo.
-% Use the author name as the 3rd argument followed by the biography text.
-% \end{IEEEbiography}
-% \vspace{11pt}
-\bf{If you will not include a photo:}\vspace{-33pt}
-\begin{IEEEbiographynophoto}{John Doe}
-Use $\backslash${\tt{begin\{IEEEbiographynophoto\}}} and the author name as the argument followed by the biography text.
+%\newpage
+\begin{IEEEbiographynophoto}{Shengqi Wang}
+received the Master's degree from Xi'an Jiaotong University, Xi'an, China, in 2020. He is currently pursuing the Ph.D. degree in statistics at Xi'an Jiaotong University. His research interests include low-level computer vision and deep learning.
 \end{IEEEbiographynophoto}
+%
+\begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{thsis_figure/ljm.pdf}}]{Junmin Liu}
+was born in 1982. He received the Ph.D. degree in Mathematics from Xi'an Jiaotong University, Xi'an, China, in 2013. From 2011 to 2012, he served as a Research Assistant with the Department of Geography and Resource Management at the Chinese University of Hong Kong, Hong Kong, China. From 2014 to 2017, he worked as a Visiting Scholar at the University of Maryland, College Park, USA. He is currently a full Professor at the School of Mathematics and Statistics, Xi'an Jiaotong University, Xi'an, China. His research interests are mainly focused on the theory and application of machine learning and image processing. He has published over 60 research papers in international conferences and journals.
+\end{IEEEbiography}
 \vfill

 \end{document}