Commit ac1d06d

committed

ximgproc: optimize add_mul using NEON intrinsics for ARM64

1 parent ea9f108, commit ac1d06d (Copy full SHA for ac1d06d)

File tree

+27

-0

lines changed

+27

-0

lines changed

Lines changed: 27 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -60,6 +60,19 @@ inline bool CPU_SUPPORT_SSE1()`
`60`	`60`	`} // end`
`61`	`61`	`#endif`
`62`	`62`
	`63`	`+#if CV_NEON`
	`64`	`+namespace // anonymous namespace: the helper below has internal linkage`
	`65`	`+{`
	`66`	`+// Returns the result of cv::checkHardwareSupport(CV_CPU_NEON); the query runs once (function-local static) and the cached value is returned on later calls.`
	`67`	`+inline bool CPU_SUPPORT_NEON()`
	`68`	`+{`
	`69`	`+ static const bool is_supported = cv::checkHardwareSupport(CV_CPU_NEON);`
	`70`	`+ return is_supported;`
	`71`	`+}`
	`72`	`+`
	`73`	`+} // end anonymous namespace`
	`74`	`+#endif // CV_NEON`
	`75`	`+`
`63`	`76`	`namespace cv`
`64`	`77`	`{`
`65`	`78`	`namespace ximgproc`
`@@ -288,6 +301,20 @@ void add_mul(float *dst, float *src1, float *src2, int w)`
`288`	`301`	`_mm_storeu_ps(dst + j, c);`
`289`	`302`	`}`
`290`	`303`	`}`
	`304`	`+#elif CV_NEON`
	`305`	`+ if (CPU_SUPPORT_NEON())`
	`306`	`+ {`
	`307`	`+ float32x4_t a, b, c;`
	`308`	`+ for (; j < w - 3; j += 4)`
	`309`	`+ {`
	`310`	`+ a = vld1q_f32(src1 + j);`
	`311`	`+ b = vld1q_f32(src2 + j);`
	`312`	`+ b = vmulq_f32(b, a);`
	`313`	`+ c = vld1q_f32(dst + j);`
	`314`	`+ c = vaddq_f32(c, b);`
	`315`	`+ vst1q_f32(dst + j, c);`
	`316`	`+ }`
	`317`	`+ }`
`291`	`318`	`#endif`
`292`	`319`	`for (; j < w; j++)`
`293`	`320`	`{`

Comments

(0)