Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions aws-neuronx-dkms-mkdeb/debian/prerm
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ set -e
case "$1" in
remove|upgrade|deconfigure)
if [ "`dkms status -m $NAME`" ]; then
# Check if module is loaded before attempting removal
if lsmod | grep -q "^neuron "; then
echo "Neuron module is currently loaded. Attempting to unload..."
if ! rmmod neuron 2>/dev/null; then
echo "ERROR: Cannot unload neuron module - it is currently in use." >&2
echo "Please stop all processes using the neuron module before uninstalling." >&2
exit 1
fi
fi
dkms remove -m $NAME -v $VERSION --all
fi
;;
Expand Down
9 changes: 9 additions & 0 deletions aws-neuronx-dkms-mkrpm.spec
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,15 @@ exit 1
%preun
# Pre-uninstall scriptlet: if the neuron kernel module is loaded, try to
# unload it first; abort the uninstall when it is still in use. Otherwise
# remove the module from DKMS.
echo -e
echo -e "Uninstall of %{module_name} module (version %{version}) beginning:"
# Check if module is loaded before attempting removal
if lsmod | grep -q "^neuron "; then
    echo "Neuron module is currently loaded. Attempting to unload..."
    if ! rmmod neuron 2>/dev/null; then
        # Diagnostics go to stderr, matching the Debian prerm script.
        echo "ERROR: Cannot unload neuron module - it is currently in use." >&2
        echo "Please stop all processes using the neuron module before uninstalling." >&2
        exit 1
    fi
fi
dkms remove -m %{module_name} -v %{version} --all --rpm_safe_upgrade
exit 0

Expand Down
2 changes: 1 addition & 1 deletion dkms.conf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
PACKAGE_NAME=aws-neuronx
PACKAGE_VERSION=2.25.4.0
PACKAGE_VERSION=2.26.5.0
BUILT_MODULE_NAME[0]="neuron"
MAKE[0]="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build"
CLEAN="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build clean"
Expand Down
77 changes: 67 additions & 10 deletions neuron_cdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,8 @@ static int ncdev_dma_queue_init_batch(struct neuron_device *nd, void *param)

ret = neuron_copy_from_user(__func__, arg, (struct neuron_ioctl_dma_queue_init_batch *)param, sizeof(struct neuron_ioctl_dma_queue_init_batch));
if (ret) {
return -EACCES;
ret = -EACCES;
goto done;
}

if (arg->count >= MAX_DMA_QUEUE_INIT_BATCH) {
Expand Down Expand Up @@ -1200,7 +1201,7 @@ static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, unsigned int cmd,
IS_ALIGNED(offset, 4);

// For smallish transfers, just do "copy from" directly to bar4
// simulation (inkling) does not have bar4 mapped to the actual memory, don't do it
// simulation does not have bar4 mapped to the actual memory, don't do it
if (use_bar4_wr) {
u64 cpy_offset;
ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + offset, size, &cpy_offset);
Expand Down Expand Up @@ -1371,7 +1372,7 @@ static int ncdev_mem_buf_zerocopy64_batch(struct neuron_device *nd, void *param)
}

// For smallish transfers, just do "copy from" directly to bar4
// simulation (inkling) does not have bar4 mapped to the actual memory, don't do it
// simulation does not have bar4 mapped to the actual memory, don't do it
if (use_bar4_wr) {
for (j = 0; j < batch->num_ops; j++) {
const nrt_tensor_batch_op_t op = batch->ops_ptr[j];
Expand All @@ -1397,12 +1398,14 @@ static int ncdev_mem_buf_zerocopy64_batch(struct neuron_device *nd, void *param)

if (!ndmar_qid_valid(qid)) {
pr_err("nd%02d: invalid h2t queue index %d", nd->device_index, qid);
return -ENOENT;
ret = -ENOENT;
goto cleanup;
}

if (!ndma_zerocopy_supported()) {
pr_err_once("nd%02d: zero copy is not supported for architectures requiring DMA retry", nd->device_index);
return -EINVAL;
ret = -EINVAL;
goto cleanup;
}

// use the zero-copy batch function for ops within a single batch
Expand Down Expand Up @@ -1480,9 +1483,25 @@ static long ncdev_bar_read(struct neuron_device *nd, u8 bar, u64 *reg_addresses,
int i;
if (bar == 0 || bar == 2) {
u32 *data = NULL;

for (i = 0; i < data_count; i++) {
u64 bar_addr = (u64)nd->npdev.bar0;
u64 bar_size = nd->npdev.bar0_size;

// From V2 arch onwards, nd->npdev.bar2 (axi_bar) is no longer initialized
// TODO: Remove bar2 fields

// nd->npdev.bar0 is initialized to APB bar
// On V2 arch, APB bar is 0 usually, but 2 in case of QEMU

if ((reg_addresses[i] < bar_addr) || (reg_addresses[i] >= bar_addr + bar_size)) {
return -EINVAL;
}
}
data = kmalloc(data_size, GFP_KERNEL);
if (data == NULL)
return -ENOMEM;

ret = ndhal->ndhal_reg_access.reg_read32_array((void **)reg_addresses, data, data_count);
if (ret) {
kfree(data);
Expand Down Expand Up @@ -1541,8 +1560,10 @@ static int ncdev_bar_write_data(struct neuron_device *nd, u8 bar, u64 *reg_addre
if (bar == 0) {
int i;
for (i = 0; i < data_count; i++) {
u64 off = reg_addresses[i] - (u64)nd->npdev.bar0;
if (off > nd->npdev.bar0_size) {
u64 bar_addr = (u64)nd->npdev.bar0;
u64 bar_size = nd->npdev.bar0_size;
u64 off = reg_addresses[i] - bar_addr;
if ((reg_addresses[i] < bar_addr) || (reg_addresses[i] >= bar_addr + bar_size)) {
return -EINVAL;
}
if (ndhal->ndhal_ndma.ndma_is_bar0_write_blocked(off)) {
Expand Down Expand Up @@ -2999,6 +3020,39 @@ static int ncdev_power_profile_set(struct neuron_device *nd, void *param)
return ndhal->ndhal_perf.perf_set_profile(nd, arg.profile);
}

static int ncdev_throttling_notifications_set(struct neuron_device *nd, void *param)
{
struct neuron_ioctl_throttling_notifications arg;
int ret;

ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_throttling_notifications*) param, sizeof(arg));
if (ret)
return ret;

return fw_io_enable_throttling_notifications(nd->fw_io_ctx, arg.enable ? true : false);
}

/**
 * ncdev_get_va_placement() - ioctl handler reporting where a virtual address is placed.
 * @param: user-space pointer to a struct neuron_ioctl_get_va_placement; its
 *         'va' field is the input, 'device_index' and 'hbm_index' are outputs
 *
 * Looks up arg.va via nmmap_get_va_placement() and writes the resulting device
 * and HBM indices back to user space. When the lookup fails, both indices are
 * reported as -1 and the lookup's error code is returned.
 *
 * Return: 0 on success; the non-zero value from neuron_copy_from_user() if the
 *         request cannot be read; the error from nmmap_get_va_placement() if
 *         the lookup fails; -EFAULT if the lookup succeeded but the result
 *         could not be copied back to user space.
 */
static int ncdev_get_va_placement(void *param)
{
	struct neuron_ioctl_get_va_placement arg;
	int device_index = -1;
	int hbm_index = -1;
	int ret;

	ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_get_va_placement *)param, sizeof(arg));
	if (ret)
		return ret;

	ret = nmmap_get_va_placement((void *)arg.va, &device_index, &hbm_index);
	if (ret) {
		// Do not trust the out-parameters after a failed lookup.
		device_index = -1;
		hbm_index = -1;
	}
	arg.device_index = device_index;
	arg.hbm_index = hbm_index;

	// The copy-back is always attempted so user space sees the -1 markers on
	// lookup failure. A failed copy must not be reported as success, but it
	// also must not mask an earlier lookup error.
	if (copy_to_user(param, &arg, sizeof(arg)) && !ret)
		ret = -EFAULT;

	return ret;
}

inline static long ncdev_misc_ioctl(struct file *filep, unsigned int cmd, unsigned long param) {
if ((cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK) || (cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK_EXT0)) {
return ncdev_crwl_nc_range_mark(filep, cmd, (void *)param);
Expand Down Expand Up @@ -3031,6 +3085,8 @@ inline static long ncdev_misc_ioctl(struct file *filep, unsigned int cmd, unsign
return ncdev_pod_status(cmd, (void *)param);
} else if (_IOC_NR(cmd) == _IOC_NR(NEURON_IOCTL_POD_CTRL)) {
return ncdev_pod_ctrl(filep, cmd, (void *)param);
} else if (_IOC_NR(cmd) == _IOC_NR(NEURON_IOCTL_GET_VA_PLACEMENT)) {
return ncdev_get_va_placement((void *)param);
}

pr_err("invalid misc IOCTL %d (dir=%d, type=%d, nr=%d, size=%d)\n", cmd, _IOC_DIR(cmd),
Expand Down Expand Up @@ -3226,6 +3282,8 @@ static long ncdev_ioctl(struct file *filep, unsigned int cmd, unsigned long para
return ncdev_h2t_dma_free_queues(nd, cmd, (void*)param);
} else if (cmd == NEURON_IOCTL_POWER_PROFILE) {
return ncdev_power_profile_set(nd, (void*)param);
} else if (cmd == NEURON_IOCTL_THROTTLING_NOTIFICATIONS) {
return ncdev_throttling_notifications_set(nd, (void*)param);
}

// B/W compatibility
Expand Down Expand Up @@ -3500,6 +3558,8 @@ static inline int ncdev_init_device_node(struct ncdev *devnode, const char *dev_
return ret;
}
devnode->device = device;
devnode->minor = minor;
devnode->ndev = ndev;

ret = sysfs_create_group(&(device->kobj), &attr_group);
if (ret) {
Expand All @@ -3518,9 +3578,6 @@ static inline int ncdev_init_device_node(struct ncdev *devnode, const char *dev_
return -1;
}

devnode->minor = minor;
devnode->ndev = ndev;

return 0;
}

Expand Down
5 changes: 3 additions & 2 deletions neuron_dhal.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ struct ndhal_address_map {
uint64_t event_count;
uint32_t ts_per_device;
int dma_eng_per_nc;
int dma_eng_per_nd;
int seng_dma_eng_per_nd;
int h2d_dma_eng_per_nd;
int dram_channels;
};

Expand Down Expand Up @@ -280,4 +281,4 @@ int ndhal_register_funcs_v2(void);
int ndhal_register_funcs_v3(void);
int ndhal_register_funcs_v4(void);

#endif
#endif // #ifndef NEURON_DHAL_H
1 change: 1 addition & 0 deletions neuron_dma.c
Original file line number Diff line number Diff line change
Expand Up @@ -986,6 +986,7 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx)
break;
}
contig_size += PAGE_SIZE;
tmp = next;
}

if (dma_ctx->direction) { // write to device
Expand Down
60 changes: 53 additions & 7 deletions neuron_fw_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -233,14 +233,18 @@ static const u32 fw_io_cmd_timeout_tbl[FW_IO_CMD_MAX] = {
0, // cmd 0
(1000 * 1000 * 1), // cmd 1 (FW_IO_CMD_READ)
(1000 * 1000 * 1), // cmd 2 (FW_IO_CMD_POST_TO_CW)
(1000 * 1000 * 60) // cmd 3 (FW_IO_CMD_SET_POWER_PROFILE)
(1000 * 1000 * 60), // cmd 3 (FW_IO_CMD_SET_POWER_PROFILE)
(1000 * 1000 * 1), // cmd 4 (FW_IO_CMD_GET_DATA)
(1000 * 1000 * 60), // cmd 5 (FW_IO_CMD_SET_FEATURE)
};

// Per-command retry counts, indexed by firmware I/O command id.
// Entry order must stay in step with the command ids named in the comments.
static const u32 fw_io_cmd_retry_tbl[FW_IO_CMD_MAX] = {
	0,  // cmd 0
	15, // cmd 1 (FW_IO_CMD_READ)
	15, // cmd 2 (FW_IO_CMD_POST_TO_CW)
	3,  // cmd 3 (FW_IO_CMD_SET_POWER_PROFILE)
	3,  // cmd 4 (FW_IO_CMD_GET_DATA)
	3,  // cmd 5 (FW_IO_CMD_SET_FEATURE)
};

static u32 crc32c(const u8 *hdr, const u8 *data, size_t len)
Expand Down Expand Up @@ -352,7 +356,7 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re
ret = fw_io_api_version_read(ctx->bar0, &api_version_num);

if ((ret != 0) || (api_version_num < FW_IO_NEW_READLESS_READ_MIN_API_VERSION)) {
pr_info_once("Pacific version %d, using legacy Pacific/Runtime comm framework", api_version_num);
pr_info_once("Firmware version %d, using legacy Firmware/Runtime comm framework", api_version_num);
return -ENOTSUPP;
}

Expand Down Expand Up @@ -657,6 +661,7 @@ struct fw_io_ctx *fw_io_setup(void __iomem *bar0, u64 bar0_size,
ctx->next_seq_num = 1;
mutex_init(&ctx->lock);

ctx->request_response_size = FW_IO_MAX_SIZE;
ctx->request = kmalloc(FW_IO_MAX_SIZE, GFP_ATOMIC);
if (ctx->request == NULL) {
pr_err("memory allocation failed\n");
Expand Down Expand Up @@ -705,24 +710,41 @@ void fw_io_destroy(struct fw_io_ctx *ctx)
kfree(ctx);
}

uint32_t fw_io_get_total_uecc_err_count(void *bar0) {
/*
 * Extract the uncorrectable-error count from a raw ECC counter word.
 *
 * The field position depends on the firmware API version:
 *   api_version <  6 : bits [15:0]
 *   api_version >= 6 : bits [15:12]
 */
static inline uint32_t uncorrectable_ecc_err_count(uint32_t api_version, uint32_t ecc_err_count) {
	if (api_version < 6) {
		return ecc_err_count & 0xffff;
	}
	return (ecc_err_count >> 12) & 0xf;
}

/*
 * Extract the repairable-error count from a raw ECC counter word.
 *
 *   api_version <  6 : not reported, always 0
 *   api_version >= 6 : bits [11:0]
 */
static inline uint32_t repairable_ecc_err_count(uint32_t api_version, uint32_t ecc_err_count) {
	uint32_t repairable = 0;

	if (api_version >= 6) {
		repairable = ecc_err_count & 0xfff;
	}
	return repairable;
}

/**
 * fw_io_get_total_ecc_err_counts() - sum ECC error counters over all DRAM channels.
 * @bar0:                 BAR0 mapping handed to the firmware I/O read helpers
 * @ue_ecc_count:         out: total uncorrectable ECC errors
 * @repairable_ecc_count: out: total repairable ECC errors
 *
 * Reads one 32-bit counter word per channel, starting at
 * FW_IO_REG_HBM0_ECC_OFFSET, and decodes it according to the firmware API
 * version. Channels whose read fails are logged and skipped. Channels that
 * read back 0xdeadbeef are skipped as well (presumably a "no data" marker;
 * confirm against the firmware spec).
 */
void fw_io_get_total_ecc_err_counts(void *bar0, uint32_t *ue_ecc_count, uint32_t *repairable_ecc_count) {
	uint32_t total_uncorrected_ecc_err_count = 0;
	uint32_t total_repairable_ecc_err_count = 0;
	uint32_t channel = 0;
	uint32_t api_version = 0;

	// A failed version read must not leave api_version indeterminate. Fall
	// back to 0, which selects the legacy (pre-v6) counter layout.
	if (fw_io_api_version_read(bar0, &api_version)) {
		pr_err_once("sysfs failed to read FWIO API version, assuming legacy ECC layout\n");
		api_version = 0;
	}

	for (channel = 0; channel < ndhal->ndhal_address_map.dram_channels; channel++) {
		uint64_t ecc_offset = FW_IO_REG_HBM0_ECC_OFFSET + channel * sizeof(uint32_t);
		uint32_t ecc_err_count = 0;
		int ret = fw_io_ecc_read(bar0, ecc_offset, &ecc_err_count);

		if (ret) {
			pr_err("sysfs failed to read ECC HBM%u error from FWIO\n", channel);
		} else if (ecc_err_count != 0xdeadbeef) {
			total_uncorrected_ecc_err_count += uncorrectable_ecc_err_count(api_version, ecc_err_count);
			total_repairable_ecc_err_count += repairable_ecc_err_count(api_version, ecc_err_count);
		}
	}

	*ue_ecc_count = total_uncorrected_ecc_err_count;
	*repairable_ecc_count = total_repairable_ecc_err_count;
}

int fw_io_set_power_profile(struct fw_io_ctx *ctx, uint32_t profile)
Expand All @@ -739,3 +761,27 @@ int fw_io_set_power_profile(struct fw_io_ctx *ctx, uint32_t profile)

return fw_io_execute_request_new(ctx, FW_IO_CMD_SET_POWER_PROFILE, (u8 *)&data, sizeof(data), NULL, 0);
}

/**
 * fw_io_enable_throttling_notifications() - turn firmware throttling notifications on or off.
 * @ctx:    firmware I/O context; must not be NULL
 * @enable: true to enable the notifications, false to disable
 *
 * Sends a FW_IO_CMD_SET_FEATURE request carrying a one-byte feature bitmap.
 *
 * Note:
 * This implementation assumes throttling notifications is the only feature.
 * The bitmap sent is either FW_IO_FEATURE_THROTTLING_NOTIFICATIONS (enable)
 * or 0x00 (disable all). When more features are added, we need to:
 *   1. Add a get feature command to read current feature state
 *   2. Cache the current features bitmap in fw_io_ctx
 *   3. Set/clear individual feature bits
 * This will avoid wiping out other enabled features when disabling one feature.
 *
 * Return: -EINVAL if @ctx is NULL, otherwise the result of
 *         fw_io_execute_request_new().
 */
int fw_io_enable_throttling_notifications(struct fw_io_ctx *ctx, bool enable)
{
	u8 feature_bitmap;

	if (ctx == NULL)
		return -EINVAL;

	feature_bitmap = enable ? FW_IO_FEATURE_THROTTLING_NOTIFICATIONS : 0;

	return fw_io_execute_request_new(ctx, FW_IO_CMD_SET_FEATURE, &feature_bitmap, sizeof(feature_bitmap), NULL, 0);
}
Loading