@@ -97,6 +97,12 @@ enum numa_aware {
9797 NA_LAST_ENTRY
9898};
9999
100+ enum overlapping_memmove_actions { /* What memmove() does when src and dest overlap; stored in dto_overlapping_memmove_action */
101+ OVERLAPPING_CPU = 0 , /* dto_memcpymove submits nothing to DSA and returns; the memmove() wrapper's std-call fallback performs the copy */
102+ OVERLAPPING_DSA , /* the overlapping move is handed to DSA through dsa_execute(), with no CPU share */
103+ OVERLAPPING_LAST_ENTRY /* NOTE(review): looks like an end-of-enum sentinel, not a selectable action - confirm */
104+ };
105+
100106static const char * const numa_aware_names [] = {
101107 [NA_NONE ] = "none" ,
102108 [NA_BUFFER_CENTRIC ] = "buffer-centric" ,
@@ -137,6 +143,8 @@ static uint64_t tpause_wait_time = TPAUSE_C02_DELAY_NS;
137143
138144static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT ;
139145
146+ static uint8_t dto_overlapping_memmove_action = OVERLAPPING_CPU ; /* overlapping-memmove policy (enum overlapping_memmove_actions); defaults to CPU, overridable via DTO_OVERLAPPING_MEMMOVE_ACTION in init_dto() */
147+
140148static uint8_t fork_handler_registered ;
141149
142150enum memop {
@@ -209,17 +217,17 @@ static struct timespec dto_start_time;
209217 } while (0) \
210218
211219
212- #define DTO_COLLECT_STATS_DSA_END (cs , st , et , op , n , tbc , r ) \
220+ #define DTO_COLLECT_STATS_DSA_END (cs , st , et , op , n , overlap , tbc , r ) /* When cs is set: timestamp into et, compute et - st in ns, and report op/n/tbc to update_stats(); overlap is forwarded unchanged */ \
213221 do { \
214222 if (unlikely(cs)) { \
215223 uint64_t t; \
216224 clock_gettime(CLOCK_BOOTTIME, &et); \
217225 t = (((et.tv_sec*1000000000) + et.tv_nsec) - \
218226 ((st.tv_sec*1000000000) + st.tv_nsec)); \
219227 if (unlikely(r != SUCCESS)) \
220- update_stats(op, n, tbc, t, DSA_CALL_FAILED, r); \
228+ update_stats(op, n, overlap, tbc, t, DSA_CALL_FAILED, r); /* r != SUCCESS: logged in the failed group with r as the error code */ \
221229 else \
222- update_stats(op, n, tbc, t, DSA_CALL_SUCCESS, 0); \
230+ update_stats(op, n, overlap, tbc, t, DSA_CALL_SUCCESS, 0); /* r == SUCCESS: logged in the success group with error code 0 */ \
223231 } \
224232 } while (0) \
225233
@@ -230,7 +238,7 @@ static struct timespec dto_start_time;
230238 clock_gettime(CLOCK_BOOTTIME, &et); \
231239 t = (((et.tv_sec*1000000000) + et.tv_nsec) - \
232240 ((st.tv_sec*1000000000) + st.tv_nsec)); \
233- update_stats(op, orig_n, n, t, STDC_CALL, 0); \
241+ update_stats(op, orig_n, false, n, t, STDC_CALL, 0); \
234242 } \
235243 } while (0) \
236244
@@ -537,10 +545,7 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
537545 ret = 0 ;
538546 }
539547 if (!ret ) {
540- if (auto_adjust_knobs )
541- dsa_wait_and_adjust (comp );
542- else
543- dsa_wait_no_adjust (comp );
548+ dsa_wait_no_adjust (comp );
544549
545550 if (* comp == DSA_COMP_SUCCESS ) {
546551 thr_bytes_completed += hw -> xfer_size ;
@@ -556,9 +561,14 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
556561}
557562
558563#ifdef DTO_STATS_SUPPORT
559- static void update_stats (int op , size_t n , size_t bytes_completed ,
564+ static void update_stats (int op , size_t n , bool overlapping , size_t bytes_completed ,
560565 uint64_t elapsed_ns , int group , int error_code )
561566{
567+ // dto_memcpymove didn't actually submit the request to DSA, so there is nothing to log. This will be captured by a second call
568+ if (op == MEMMOVE && overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU && group == DSA_CALL_SUCCESS ) {
569+ return ;
570+ }
571+
562572 int bucket = (n / HIST_BUCKET_SIZE );
563573
564574 if (bucket >= HIST_NO_BUCKETS ) /* last bucket includes remaining sizes */
@@ -1271,6 +1281,14 @@ static int init_dto(void)
12711281 dto_dsa_memcmp = !!dto_dsa_memcmp ;
12721282 }
12731283
1284+ env_str = getenv ("DTO_OVERLAPPING_MEMMOVE_ACTION" ); /* optional override of the overlapping-memmove policy */
1285+ if (env_str != NULL ) {
1286+ unsigned long action ; /* parsed at full width so an out-of-range value is not silently truncated into the uint8_t knob (e.g. 257 -> 1) */
1287+ errno = 0 ;
1288+ action = strtoul (env_str , NULL , 10 );
1289+ dto_overlapping_memmove_action = (!errno && action < OVERLAPPING_LAST_ENTRY ) ? (uint8_t )action : OVERLAPPING_CPU ; /* only real enum members are accepted; conversion errors and unknown values fall back to the CPU action */
1290+ }
1291+
12741292#ifdef DTO_STATS_SUPPORT
12751293 env_str = getenv ("DTO_COLLECT_STATS" );
12761294 if (env_str != NULL ) {
@@ -1536,55 +1554,74 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n)
15361554 return true;
15371555}
15381556
1539- static void dto_memcpymove (void * dest , const void * src , size_t n , bool is_memcpy , int * result )
1557+ static bool dto_memcpymove (void * dest , const void * src , size_t n , bool is_memcpy , int * result )
15401558{
1541- struct dto_wq * wq = get_wq ( dest ) ;
1559+ struct dto_wq * wq ;
15421560 size_t cpu_size , dsa_size ;
1561+ bool is_overlapping ;
15431562
1544- thr_desc .opcode = DSA_OPCODE_MEMMOVE ;
1545- thr_desc .flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR ;
1546- if (dto_dsa_cc && (wq -> dsa_gencap & GENCAP_CC_MEMORY ))
1547- thr_desc .flags |= IDXD_OP_FLAG_CC ;
1548- thr_desc .completion_addr = (uint64_t )& thr_comp ;
1563+ thr_bytes_completed = 0 ;
15491564
1550- /* cpu_size_fraction guaranteed to be >= 0 and < 1 */
1551- if (!is_memcpy && is_overlapping_buffers (dest , src , n ))
1565+ if (!is_memcpy && is_overlapping_buffers (dest , src , n )) {
15521566 cpu_size = 0 ;
1553- else
1567+ is_overlapping = true;
1568+ } else {
1569+ /* cpu_size_fraction guaranteed to be >= 0 and < 1 */
15541570 cpu_size = n * cpu_size_fraction / 100 ;
1571+ is_overlapping = false;
1572+ }
1573+
1574+ // If this is an overlapping memmove and the action is to perform on CPU, return having done nothing and
1575+ // memmove will perform the copy and correctly attribute statistics to stdlib call group
1576+ if (is_overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU ) {
1577+ * result = SUCCESS ;
1578+ return true;
1579+ }
15551580
15561581 dsa_size = n - cpu_size ;
1582+ wq = get_wq (dest );
15571583
1558- thr_bytes_completed = 0 ;
1584+ thr_desc .opcode = DSA_OPCODE_MEMMOVE ;
1585+ thr_desc .flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR ;
1586+ if (dto_dsa_cc && (wq -> dsa_gencap & GENCAP_CC_MEMORY ))
1587+ thr_desc .flags |= IDXD_OP_FLAG_CC ;
1588+ thr_desc .completion_addr = (uint64_t )& thr_comp ;
15591589
15601590 if (dsa_size <= wq -> max_transfer_size ) {
15611591 thr_desc .src_addr = (uint64_t ) src + cpu_size ;
15621592 thr_desc .dst_addr = (uint64_t ) dest + cpu_size ;
15631593 thr_desc .xfer_size = (uint32_t ) dsa_size ;
15641594 thr_comp .status = 0 ;
1565- * result = dsa_submit (wq , & thr_desc );
1566- if (* result == SUCCESS ) {
1567- if (cpu_size ) {
1568- if (is_memcpy )
1569- orig_memcpy (dest , src , cpu_size );
1570- else
1571- orig_memmove (dest , src , cpu_size );
1572- thr_bytes_completed += cpu_size ;
1595+ if (is_overlapping ) {
1596+ * result = dsa_execute (wq , & thr_desc , & thr_comp .status );
1597+ } else {
1598+ * result = dsa_submit (wq , & thr_desc );
1599+ if (* result == SUCCESS ) {
1600+ if (cpu_size ) {
1601+ if (is_memcpy )
1602+ orig_memcpy (dest , src , cpu_size );
1603+ else
1604+ orig_memmove (dest , src , cpu_size );
1605+ thr_bytes_completed += cpu_size ;
1606+ }
1607+ * result = dsa_wait (wq , & thr_desc , & thr_comp .status );
15731608 }
1574- * result = dsa_wait (wq , & thr_desc , & thr_comp .status );
15751609 }
15761610 } else {
15771611 uint32_t threshold ;
15781612 size_t current_cpu_size_fraction = cpu_size_fraction ; // the cpu_size_fraction might be changed by the auto tune algorithm
1579- threshold = wq -> max_transfer_size * 100 / (100 - current_cpu_size_fraction );
1613+ if (is_overlapping ) { /* overlapping moves keep cpu_size == 0, so each chunk is one full-size DSA transfer */
1614+ threshold = wq -> max_transfer_size ; /* NOTE(review): the loop below issues chunks front-to-back; if dest lies above src within the overlap, an earlier chunk may overwrite source bytes that a later chunk (or the CPU fallback for the tail) still has to read - confirm this ordering is safe */
1615+ } else {
1616+ threshold = wq -> max_transfer_size * 100 / (100 - current_cpu_size_fraction ); /* the CPU copies current_cpu_size_fraction percent of each chunk, so the chunk is sized to leave about max_transfer_size bytes for DSA */
1617+ }
1618+
15801619 do {
15811620 size_t len ;
15821621
15831622 len = n <= threshold ? n : threshold ;
15841623
1585- if (!is_memcpy && is_overlapping_buffers (dest , src , len ))
1586- cpu_size = 0 ;
1587- else
1624+ if (!is_overlapping )
15881625 cpu_size = len * current_cpu_size_fraction / 100 ;
15891626
15901627 dsa_size = len - cpu_size ;
@@ -1593,30 +1630,36 @@ static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy
15931630 thr_desc .dst_addr = (uint64_t ) dest + cpu_size + thr_bytes_completed ;
15941631 thr_desc .xfer_size = (uint32_t ) dsa_size ;
15951632 thr_comp .status = 0 ;
1596- * result = dsa_submit (wq , & thr_desc );
1597- if (* result == SUCCESS ) {
1598- if (cpu_size ) {
1599- const void * src1 = src + thr_bytes_completed ;
1600- void * dest1 = dest + thr_bytes_completed ;
1601-
1602- if (is_memcpy )
1603- orig_memcpy (dest1 , src1 , cpu_size );
1604- else
1605- orig_memmove (dest1 , src1 , cpu_size );
1606- thr_bytes_completed += cpu_size ;
1633+ if (is_overlapping ){
1634+ * result = dsa_execute (wq , & thr_desc , & thr_comp .status );
1635+ } else {
1636+ * result = dsa_submit (wq , & thr_desc );
1637+ if (* result == SUCCESS ) {
1638+ if (cpu_size ) {
1639+ const void * src1 = src + thr_bytes_completed ;
1640+ void * dest1 = dest + thr_bytes_completed ;
1641+
1642+ if (is_memcpy )
1643+ orig_memcpy (dest1 , src1 , cpu_size );
1644+ else
1645+ orig_memmove (dest1 , src1 , cpu_size );
1646+ thr_bytes_completed += cpu_size ;
1647+ }
1648+ * result = dsa_wait (wq , & thr_desc , & thr_comp .status );
16071649 }
1608- * result = dsa_wait (wq , & thr_desc , & thr_comp .status );
16091650 }
16101651
16111652 if (* result != SUCCESS )
16121653 break ;
16131654 n -= len ;
16141655 /* If remaining bytes are less than dsa_min_size,
1615- * dont submit to DSA. Instead, complete remaining
1616- * bytes on CPU
1617- */
1656+ * dont submit to DSA. Instead, complete remaining
1657+ * bytes on CPU
1658+ */
16181659 } while (n >= dsa_min_size );
16191660 }
1661+
1662+ return is_overlapping ;
16201663}
16211664
16221665static int dto_memcmp (const void * s1 , const void * s2 , size_t n , int * result )
@@ -1746,7 +1789,7 @@ void *memset(void *s1, int c, size_t n)
17461789 dto_memset (s1 , c , n , & result );
17471790
17481791#ifdef DTO_STATS_SUPPORT
1749- DTO_COLLECT_STATS_DSA_END (collect_stats , st , et , MEMSET , n , thr_bytes_completed , result );
1792+ DTO_COLLECT_STATS_DSA_END (collect_stats , st , et , MEMSET , n , false, thr_bytes_completed , result );
17501793#endif
17511794 if (thr_bytes_completed != n ) {
17521795 /* fallback to std call if job is only partially completed */
@@ -1796,7 +1839,7 @@ void *memcpy(void *dest, const void *src, size_t n)
17961839 dto_memcpymove (dest , src , n , 1 , & result );
17971840
17981841#ifdef DTO_STATS_SUPPORT
1799- DTO_COLLECT_STATS_DSA_END (collect_stats , st , et , MEMCOPY , n , thr_bytes_completed , result );
1842+ DTO_COLLECT_STATS_DSA_END (collect_stats , st , et , MEMCOPY , n , false, thr_bytes_completed , result );
18001843#endif
18011844 if (thr_bytes_completed != n ) {
18021845 /* fallback to std call if job is only partially completed */
@@ -1828,6 +1871,7 @@ void *memmove(void *dest, const void *src, size_t n)
18281871 int result = 0 ;
18291872 void * ret = dest ;
18301873 int use_orig_func = USE_ORIG_FUNC (n , dto_dsa_memmove );
1874+ bool is_overlapping ;
18311875#ifdef DTO_STATS_SUPPORT
18321876 struct timespec st , et ;
18331877 size_t orig_n = n ;
@@ -1846,10 +1890,10 @@ void *memmove(void *dest, const void *src, size_t n)
18461890#ifdef DTO_STATS_SUPPORT
18471891 DTO_COLLECT_STATS_START (collect_stats , st );
18481892#endif
1849- dto_memcpymove (dest , src , n , 0 , & result );
1893+ is_overlapping = dto_memcpymove (dest , src , n , 0 , & result );
18501894
18511895#ifdef DTO_STATS_SUPPORT
1852- DTO_COLLECT_STATS_DSA_END (collect_stats , st , et , MEMMOVE , n , thr_bytes_completed , result );
1896+ DTO_COLLECT_STATS_DSA_END (collect_stats , st , et , MEMMOVE , n , is_overlapping , thr_bytes_completed , result );
18531897#endif
18541898 if (thr_bytes_completed != n ) {
18551899 /* fallback to std call if job is only partially completed */
@@ -1902,7 +1946,7 @@ int memcmp(const void *s1, const void *s2, size_t n)
19021946 ret = dto_memcmp (s1 , s2 , n , & result );
19031947
19041948#ifdef DTO_STATS_SUPPORT
1905- DTO_COLLECT_STATS_DSA_END (collect_stats , st , et , MEMCMP , n , thr_bytes_completed , result );
1949+ DTO_COLLECT_STATS_DSA_END (collect_stats , st , et , MEMCMP , n , false, thr_bytes_completed , result );
19061950#endif
19071951 if (thr_bytes_completed != n ) {
19081952 /* fallback to std call if job is only partially completed */
0 commit comments