C语言嵌入arm neon汇编问题

时间:2021-02-04 00:58:57
下面是一段深度学习前向计算中卷积用neon指令加速的代码,是C语言嵌入汇编实现的
                    "vmla.f32   q6, q10, %e20[1]    \n"
                    "vmla.f32   q12, q11, %f19[0]   \n"
挑两行出来问下,这里%e, %f是什么意思,我手上也有些汇编器手册,不过没查到这两个符号的含义,有知道的大佬吗,多谢!!

                asm volatile(

                    "pld        [%5, #192]          \n"
                    "vld1.f32   {d16-d18}, [%5   ] \n"// r0
                    "add        %5, #16             \n"

                    "pld        [%8, #192]          \n"
                    "vld1.f32   {d28-d30}, [%8]     \n"// r3
                    "add        %8, #16             \n"

                    "vext.32    q10, q8, q9, #1     \n"
                    "vext.32    q11, q14, q15, #2   \n"

                    "0:                             \n"

                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d12-d13}, [%1   ] \n"// _sum0

                    "pld        [%2, #128]          \n"
                    "vld1.f32   {d14-d15}, [%2   ] \n"// _sum1

                    "vmla.f32   q6, q8, %e18[0]     \n"
                    "vmla.f32   q7, q8, %e21[0]     \n"

                    "pld        [%3, #128]          \n"
                    "vld1.f32   {d24-d25}, [%3]     \n"// _sum0n

                    "pld        [%4, #128]          \n"
                    "vld1.f32   {d26-d27}, [%4]     \n"// _sum1n

                    "vmla.f32   q12, q14, %e20[0]   \n"
                    "vmla.f32   q13, q14, %e23[0]   \n"

                    "vext.32    q8, q8, q9, #2      \n"
                    "vext.32    q9, q14, q15, #1    \n"

                    "vmla.f32   q6, q10, %e18[1]    \n"
                    "vmla.f32   q7, q10, %e21[1]    \n"
                    "vmla.f32   q12, q11, %f20[0]   \n"
                    "vmla.f32   q13, q11, %f23[0]   \n"

                    "pld        [%6, #192]          \n"
                    "vld1.f32   {d28-d30}, [%6]     \n"// r1
                    "add        %6, #16             \n"

                    "vmla.f32   q6, q8, %f18[0]     \n"
                    "vmla.f32   q7, q8, %f21[0]     \n"
                    "vmla.f32   q12, q9, %e20[1]    \n"
                    "vmla.f32   q13, q9, %e23[1]    \n"

                    "vext.32    q10, q14, q15, #1   \n"

                    "vmla.f32   q6, q14, %e19[0]    \n"
                    "vmla.f32   q7, q14, %e22[0]    \n"
                    "vmla.f32   q12, q14, %e18[0]   \n"
                    "vmla.f32   q13, q14, %e21[0]   \n"

                    "vext.32    q11, q14, q15, #2   \n"

                    "vmla.f32   q6, q10, %e19[1]    \n"
                    "vmla.f32   q7, q10, %e22[1]    \n"
                    "vmla.f32   q12, q10, %e18[1]   \n"
                    "vmla.f32   q13, q10, %e21[1]   \n"

                    "pld        [%7, #192]          \n"
                    "vld1.f32   {d16-d18}, [%7   ] \n"// r2
                    "add        %7, #16             \n"

                    "vmla.f32   q6, q11, %f19[0]    \n"
                    "vmla.f32   q7, q11, %f22[0]    \n"
                    "vmla.f32   q12, q11, %f18[0]   \n"
                    "vmla.f32   q13, q11, %f21[0]   \n"

                    "vext.32    q10, q8, q9, #1     \n"

                    "vmla.f32   q6, q8, %e20[0]     \n"
                    "vmla.f32   q7, q8, %e23[0]     \n"
                    "vmla.f32   q12, q8, %e19[0]    \n"
                    "vmla.f32   q13, q8, %e22[0]    \n"

                    "vext.32    q11, q8, q9, #2     \n"

                    "vmla.f32   q6, q10, %e20[1]    \n"
                    "vmla.f32   q7, q10, %e23[1]    \n"
                    "vmla.f32   q12, q10, %e19[1]   \n"
                    "vmla.f32   q13, q10, %e22[1]   \n"

                    "pld        [%5, #192]          \n"
                    "vld1.f32   {d16-d18}, [%5   ] \n"// r0
                    "add        %5, #16             \n"

                    "vmla.f32   q6, q11, %f20[0]    \n"
                    "vmla.f32   q7, q11, %f23[0]    \n"
                    "vmla.f32   q12, q11, %f19[0]   \n"
                    "vmla.f32   q13, q11, %f22[0]   \n"

                    "pld        [%8, #192]          \n"
                    "vld1.f32   {d28-d30}, [%8]     \n"// r3
                    "add        %8, #16             \n"

                    "vext.32    q10, q8, q9, #1     \n"

                    "vst1.f32   {d12-d13}, [%1 ]!\n"
                    "vst1.f32   {d14-d15}, [%2 ]!\n"

                    "vext.32    q11, q14, q15, #2   \n"

                    "vst1.f32   {d24-d25}, [%3]!    \n"
                    "vst1.f32   {d26-d27}, [%4]!    \n"

                    "subs       %0, #1              \n"
                    "bne        0b                  \n"

                    "sub        %5, #16             \n"
                    "sub        %8, #16             \n"
                    : "=r"(nn),         // %0
                      "=r"(outptr0),    // %1
                      "=r"(outptr1),    // %2
                      "=r"(outptr0n),   // %3
                      "=r"(outptr1n),   // %4
                      "=r"(r0),         // %5
                      "=r"(r1),         // %6
                      "=r"(r2),         // %7
                      "=r"(r3)          // %8
                    : "0"(nn),
                      "1"(outptr0),
                      "2"(outptr1),
                      "3"(outptr0n),
                      "4"(outptr1n),
                      "5"(r0),
                      "6"(r1),
                      "7"(r2),
                      "8"(r3),
                      "w"(_k00),      // %18
                      "w"(_k03),      // %19
                      "w"(_k06),      // %20
                      "w"(_k10),      // %21
                      "w"(_k13),      // %22
                      "w"(_k16)       // %23
                    : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
                );

1 个解决方案

#1


百度搜“gcc-inline-asm.pdf”?

#1


百度搜“gcc-inline-asm.pdf”?