Skip to content

Commit 8f3a345

Browse files
authored
Updated to memmove support (#18)
* Modified support for memmove to allow the user to select whether operations with overlapping buffers are performed entirely on CPU or entirely on DSA. Modified support for memmove to disregard memmove operations with overlapping buffers, because they are not split between CPU and DSA. Modified the dsa_execute function to not call dsa_wait_and_adjust, since dsa_execute is called when the operation wasn't split and the autotune algorithm should not be used. Signed-off-by: Sydir, Jerry <jerry.sydir@intel.com> * Modified the flow in the dto_memcpymove function to avoid calling get_wq in the case where the overlapping memmove is performed entirely on CPU. Signed-off-by: Sydir, Jerry <jerry.sydir@intel.com> * When a memmove operation is overlapping, we set the cpu fraction to 0. This is done at the top of the dto_memcpymove function. It was also done a second time within the loop; this commit removes that second instance. Signed-off-by: Sydir, Jerry <jerry.sydir@intel.com> --------- Signed-off-by: Sydir, Jerry <jerry.sydir@intel.com>
1 parent 50f536d commit 8f3a345

File tree

2 files changed

+98
-53
lines changed

2 files changed

+98
-53
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ Following environment variables control the behavior of DTO library:
6363
DTO_DSA_MEMSET=0/1, 1 (default) - DTO uses DSA to process memset, 0 - DTO uses system memset
6464
DTO_DSA_MEMCMP=0/1, 1 (default) - DTO uses DSA to process memcmp, 0 - DTO uses system memcmp
6565
DTO_DSA_CC=0/1, 1 (default) - DTO sets DSA Cache Control flag to 1 if DSA supports cache control, 0 - DTO sets DSA Cache Control flag to 0
66+
DTO_OVERLAPPING_MEMMOVE_ACTION=0/1, 0 (default) - DTO submits memmove operations with overlapping buffers entirely to CPU, 1 - DTO submits memmove operations with overlapping buffers entirely to DSA
6667
DTO_UMWAIT_DELAY=xxxx defines delay for umwait command (check max possible value at: /sys/devices/system/cpu/umwait_control/max_time), default is 100000
6768
DTO_LOG_FILE=<dto log file path> Redirect the DTO output to the specified file instead of std output (useful for debugging and statistics collection). The file name is suffixed with the process pid.
6869
DTO_LOG_LEVEL=0/1/2 controls the log level. higher value means more verbose logging (default 0).

dto.c

Lines changed: 97 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,12 @@ enum numa_aware {
9797
NA_LAST_ENTRY
9898
};
9999

100+
enum overlapping_memmove_actions {
101+
OVERLAPPING_CPU = 0,
102+
OVERLAPPING_DSA,
103+
OVERLAPPING_LAST_ENTRY
104+
};
105+
100106
static const char * const numa_aware_names[] = {
101107
[NA_NONE] = "none",
102108
[NA_BUFFER_CENTRIC] = "buffer-centric",
@@ -137,6 +143,8 @@ static uint64_t tpause_wait_time = TPAUSE_C02_DELAY_NS;
137143

138144
static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT;
139145

146+
static uint8_t dto_overlapping_memmove_action = OVERLAPPING_CPU;
147+
140148
static uint8_t fork_handler_registered;
141149

142150
enum memop {
@@ -209,17 +217,17 @@ static struct timespec dto_start_time;
209217
} while (0) \
210218

211219

212-
#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, tbc, r) \
220+
#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, overlap, tbc, r) \
213221
do { \
214222
if (unlikely(cs)) { \
215223
uint64_t t; \
216224
clock_gettime(CLOCK_BOOTTIME, &et); \
217225
t = (((et.tv_sec*1000000000) + et.tv_nsec) - \
218226
((st.tv_sec*1000000000) + st.tv_nsec)); \
219227
if (unlikely(r != SUCCESS)) \
220-
update_stats(op, n, tbc, t, DSA_CALL_FAILED, r); \
228+
update_stats(op, n, overlap, tbc, t, DSA_CALL_FAILED, r); \
221229
else \
222-
update_stats(op, n, tbc, t, DSA_CALL_SUCCESS, 0); \
230+
update_stats(op, n, overlap, tbc, t, DSA_CALL_SUCCESS, 0); \
223231
} \
224232
} while (0) \
225233

@@ -230,7 +238,7 @@ static struct timespec dto_start_time;
230238
clock_gettime(CLOCK_BOOTTIME, &et); \
231239
t = (((et.tv_sec*1000000000) + et.tv_nsec) - \
232240
((st.tv_sec*1000000000) + st.tv_nsec)); \
233-
update_stats(op, orig_n, n, t, STDC_CALL, 0); \
241+
update_stats(op, orig_n, false, n, t, STDC_CALL, 0); \
234242
} \
235243
} while (0) \
236244

@@ -537,10 +545,7 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
537545
ret = 0;
538546
}
539547
if (!ret) {
540-
if (auto_adjust_knobs)
541-
dsa_wait_and_adjust(comp);
542-
else
543-
dsa_wait_no_adjust(comp);
548+
dsa_wait_no_adjust(comp);
544549

545550
if (*comp == DSA_COMP_SUCCESS) {
546551
thr_bytes_completed += hw->xfer_size;
@@ -556,9 +561,14 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
556561
}
557562

558563
#ifdef DTO_STATS_SUPPORT
559-
static void update_stats(int op, size_t n, size_t bytes_completed,
564+
static void update_stats(int op, size_t n, bool overlapping, size_t bytes_completed,
560565
uint64_t elapsed_ns, int group, int error_code)
561566
{
567+
// dto_memcpymove didn't actually submit the request to DSA, so there is nothing to log. This will be captured by a second call
568+
if (op == MEMMOVE && overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU && group == DSA_CALL_SUCCESS) {
569+
return;
570+
}
571+
562572
int bucket = (n / HIST_BUCKET_SIZE);
563573

564574
if (bucket >= HIST_NO_BUCKETS) /* last bucket includes remaining sizes */
@@ -1271,6 +1281,14 @@ static int init_dto(void)
12711281
dto_dsa_memcmp = !!dto_dsa_memcmp;
12721282
}
12731283

1284+
env_str = getenv("DTO_OVERLAPPING_MEMMOVE_ACTION");
1285+
if (env_str != NULL) {
1286+
errno = 0;
1287+
dto_overlapping_memmove_action = strtoul(env_str, NULL, 10);
1288+
if (errno)
1289+
dto_overlapping_memmove_action = OVERLAPPING_CPU;
1290+
}
1291+
12741292
#ifdef DTO_STATS_SUPPORT
12751293
env_str = getenv("DTO_COLLECT_STATS");
12761294
if (env_str != NULL) {
@@ -1536,55 +1554,74 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n)
15361554
return true;
15371555
}
15381556

1539-
static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
1557+
static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
15401558
{
1541-
struct dto_wq *wq = get_wq(dest);
1559+
struct dto_wq *wq;
15421560
size_t cpu_size, dsa_size;
1561+
bool is_overlapping;
15431562

1544-
thr_desc.opcode = DSA_OPCODE_MEMMOVE;
1545-
thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR;
1546-
if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY))
1547-
thr_desc.flags |= IDXD_OP_FLAG_CC;
1548-
thr_desc.completion_addr = (uint64_t)&thr_comp;
1563+
thr_bytes_completed = 0;
15491564

1550-
/* cpu_size_fraction guaranteed to be >= 0 and < 1 */
1551-
if (!is_memcpy && is_overlapping_buffers(dest, src, n))
1565+
if (!is_memcpy && is_overlapping_buffers(dest, src, n)) {
15521566
cpu_size = 0;
1553-
else
1567+
is_overlapping = true;
1568+
} else {
1569+
/* cpu_size_fraction guaranteed to be >= 0 and < 1 */
15541570
cpu_size = n * cpu_size_fraction / 100;
1571+
is_overlapping = false;
1572+
}
1573+
1574+
// If this is an overlapping memmove and the action is to perform on CPU, return having done nothing and
1575+
// memmove will perform the copy and correctly attribute statistics to stdlib call group
1576+
if (is_overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU) {
1577+
*result = SUCCESS;
1578+
return true;
1579+
}
15551580

15561581
dsa_size = n - cpu_size;
1582+
wq = get_wq(dest);
15571583

1558-
thr_bytes_completed = 0;
1584+
thr_desc.opcode = DSA_OPCODE_MEMMOVE;
1585+
thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR;
1586+
if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY))
1587+
thr_desc.flags |= IDXD_OP_FLAG_CC;
1588+
thr_desc.completion_addr = (uint64_t)&thr_comp;
15591589

15601590
if (dsa_size <= wq->max_transfer_size) {
15611591
thr_desc.src_addr = (uint64_t) src + cpu_size;
15621592
thr_desc.dst_addr = (uint64_t) dest + cpu_size;
15631593
thr_desc.xfer_size = (uint32_t) dsa_size;
15641594
thr_comp.status = 0;
1565-
*result = dsa_submit(wq, &thr_desc);
1566-
if (*result == SUCCESS) {
1567-
if (cpu_size) {
1568-
if (is_memcpy)
1569-
orig_memcpy(dest, src, cpu_size);
1570-
else
1571-
orig_memmove(dest, src, cpu_size);
1572-
thr_bytes_completed += cpu_size;
1595+
if (is_overlapping) {
1596+
*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
1597+
} else {
1598+
*result = dsa_submit(wq, &thr_desc);
1599+
if (*result == SUCCESS) {
1600+
if (cpu_size) {
1601+
if (is_memcpy)
1602+
orig_memcpy(dest, src, cpu_size);
1603+
else
1604+
orig_memmove(dest, src, cpu_size);
1605+
thr_bytes_completed += cpu_size;
1606+
}
1607+
*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
15731608
}
1574-
*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
15751609
}
15761610
} else {
15771611
uint32_t threshold;
15781612
size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm
1579-
threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction);
1613+
if (is_overlapping) {
1614+
threshold = wq->max_transfer_size;
1615+
} else {
1616+
threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction);
1617+
}
1618+
15801619
do {
15811620
size_t len;
15821621

15831622
len = n <= threshold ? n : threshold;
15841623

1585-
if (!is_memcpy && is_overlapping_buffers(dest, src, len))
1586-
cpu_size = 0;
1587-
else
1624+
if (!is_overlapping)
15881625
cpu_size = len * current_cpu_size_fraction / 100;
15891626

15901627
dsa_size = len - cpu_size;
@@ -1593,30 +1630,36 @@ static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy
15931630
thr_desc.dst_addr = (uint64_t) dest + cpu_size + thr_bytes_completed;
15941631
thr_desc.xfer_size = (uint32_t) dsa_size;
15951632
thr_comp.status = 0;
1596-
*result = dsa_submit(wq, &thr_desc);
1597-
if (*result == SUCCESS) {
1598-
if (cpu_size) {
1599-
const void *src1 = src + thr_bytes_completed;
1600-
void *dest1 = dest + thr_bytes_completed;
1601-
1602-
if (is_memcpy)
1603-
orig_memcpy(dest1, src1, cpu_size);
1604-
else
1605-
orig_memmove(dest1, src1, cpu_size);
1606-
thr_bytes_completed += cpu_size;
1633+
if (is_overlapping){
1634+
*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
1635+
} else {
1636+
*result = dsa_submit(wq, &thr_desc);
1637+
if (*result == SUCCESS) {
1638+
if (cpu_size) {
1639+
const void *src1 = src + thr_bytes_completed;
1640+
void *dest1 = dest + thr_bytes_completed;
1641+
1642+
if (is_memcpy)
1643+
orig_memcpy(dest1, src1, cpu_size);
1644+
else
1645+
orig_memmove(dest1, src1, cpu_size);
1646+
thr_bytes_completed += cpu_size;
1647+
}
1648+
*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
16071649
}
1608-
*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
16091650
}
16101651

16111652
if (*result != SUCCESS)
16121653
break;
16131654
n -= len;
16141655
/* If remaining bytes are less than dsa_min_size,
1615-
* dont submit to DSA. Instead, complete remaining
1616-
* bytes on CPU
1617-
*/
1656+
* dont submit to DSA. Instead, complete remaining
1657+
* bytes on CPU
1658+
*/
16181659
} while (n >= dsa_min_size);
16191660
}
1661+
1662+
return is_overlapping;
16201663
}
16211664

16221665
static int dto_memcmp(const void *s1, const void *s2, size_t n, int *result)
@@ -1746,7 +1789,7 @@ void *memset(void *s1, int c, size_t n)
17461789
dto_memset(s1, c, n, &result);
17471790

17481791
#ifdef DTO_STATS_SUPPORT
1749-
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, thr_bytes_completed, result);
1792+
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, false, thr_bytes_completed, result);
17501793
#endif
17511794
if (thr_bytes_completed != n) {
17521795
/* fallback to std call if job is only partially completed */
@@ -1796,7 +1839,7 @@ void *memcpy(void *dest, const void *src, size_t n)
17961839
dto_memcpymove(dest, src, n, 1, &result);
17971840

17981841
#ifdef DTO_STATS_SUPPORT
1799-
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, thr_bytes_completed, result);
1842+
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, false, thr_bytes_completed, result);
18001843
#endif
18011844
if (thr_bytes_completed != n) {
18021845
/* fallback to std call if job is only partially completed */
@@ -1828,6 +1871,7 @@ void *memmove(void *dest, const void *src, size_t n)
18281871
int result = 0;
18291872
void *ret = dest;
18301873
int use_orig_func = USE_ORIG_FUNC(n, dto_dsa_memmove);
1874+
bool is_overlapping;
18311875
#ifdef DTO_STATS_SUPPORT
18321876
struct timespec st, et;
18331877
size_t orig_n = n;
@@ -1846,10 +1890,10 @@ void *memmove(void *dest, const void *src, size_t n)
18461890
#ifdef DTO_STATS_SUPPORT
18471891
DTO_COLLECT_STATS_START(collect_stats, st);
18481892
#endif
1849-
dto_memcpymove(dest, src, n, 0, &result);
1893+
is_overlapping = dto_memcpymove(dest, src, n, 0, &result);
18501894

18511895
#ifdef DTO_STATS_SUPPORT
1852-
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, thr_bytes_completed, result);
1896+
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, is_overlapping, thr_bytes_completed, result);
18531897
#endif
18541898
if (thr_bytes_completed != n) {
18551899
/* fallback to std call if job is only partially completed */
@@ -1902,7 +1946,7 @@ int memcmp(const void *s1, const void *s2, size_t n)
19021946
ret = dto_memcmp(s1, s2, n, &result);
19031947

19041948
#ifdef DTO_STATS_SUPPORT
1905-
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, thr_bytes_completed, result);
1949+
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, false, thr_bytes_completed, result);
19061950
#endif
19071951
if (thr_bytes_completed != n) {
19081952
/* fallback to std call if job is only partially completed */

0 commit comments

Comments
 (0)