/*****************************************************************************\
 *  gres_hca.c - Support HCAs as generic resources.
 *****************************************************************************
 *  Copyright (C) 2021 NEC Corporation.
 *  Based upon gres_gpu.c with the copyright notice shown below:
 *  Copyright (C) 2010 Lawrence Livermore National Security.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#include <ctype.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"

#include "src/common/slurm_xlator.h"
#include "src/common/bitstring.h"
#include "src/common/env.h"
#include "src/common/gres.h"
#include "src/common/list.h"
#include "src/common/xstring.h"

#include "../common/gres_common.h"

/*
 * These variables are required by the generic plugin interface.  If they
 * are not found in the plugin, the plugin loader will ignore it.
 *
 * plugin_name - A string giving a human-readable description of the
 * plugin.  There is no maximum length, but the symbol must refer to
 * a valid string.
 *
 * plugin_type - A string suggesting the type of the plugin or its
 * applicability to a particular form of data or method of data handling.
 * If the low-level plugin API is used, the contents of this string are
 * unimportant and may be anything.  Slurm uses the higher-level plugin
 * interface which requires this string to be of the form
 *
 *	<application>/<method>
 *
 * where <application> is a description of the intended application of
 * the plugin (e.g., "auth" for Slurm authentication) and <method> is a
 * description of how this plugin satisfies that application.  Slurm will
 * only load authentication plugins if the plugin_type string has a prefix
 * of "auth/".
 *
 * plugin_version - an unsigned 32-bit integer containing the Slurm version
 * (major.minor.micro combined into a single number).
 */
/* Plugin identity symbols (see the description above). */
const char	plugin_name[]		= "Gres HCA plugin";
const char	plugin_type[]		= "gres/hca";
const uint32_t	plugin_version		= SLURM_VERSION_NUMBER;

/* GRES name passed to common_node_config_load() in node_config_load(). */
static char	gres_name[]		= "hca";
/* Prefix the configured slurmd spool directory must start with. */
static char	spooldir[]		= "/var/spool/";
/* File name prefix of the per-device information files ("<prefix><device>"). */
static char	hca_devinfo_name[]	= "hca_devinfo_";
/* Name of the file that receives the list of all HCAs. */
static char	all_hcalist_name[]	= "all_hcalist";

/* Devices of this GRES type; (re)loaded in node_config_load(). */
static List gres_devices = NULL;
/*
 * Scratch string the HCA list is accumulated into by _create_hca_list().
 * The functions calling _create_hca_list() xfree() it before returning.
 */
static char *gres_hcalist = NULL;

static char *_get_slurmd_spooldir(void)
{
	char *slurmd_spooldir = NULL;
	size_t spooldir_len = strlen(spooldir);

	slurmd_spooldir = xstrdup(slurm_conf.slurmd_spooldir);
	if (!slurmd_spooldir) {
		error("%s: fail to duplicate slurmd_spooldir", __func__);
		return NULL;
	}

	/*
	 * Do not write to any directory other than slurmd spool directory
	 * (the default value is "/var/spool/slurmd").
	 */
	if ((xstrncmp(slurmd_spooldir, spooldir, spooldir_len) != 0) ||
		(strlen(slurmd_spooldir) <= spooldir_len)) {
		error("%s: inappropriate directory: %s",
			__func__, slurmd_spooldir);
		xfree(slurmd_spooldir);
		return NULL;
	}

	debug("%s: slurmd_spooldir: %s", __func__, slurmd_spooldir);
	return slurmd_spooldir;
}

/*
 * Return true if name is non-empty and consists solely of letters, digits,
 * '_' and '-', i.e. it is safe to embed in a shell command and a file name.
 */
static bool _is_safe_hca_name(const char *name)
{
	const char *p;

	if (!name || (name[0] == '\0'))
		return false;

	for (p = name; *p != '\0'; p++) {
		if (!isalnum((unsigned char) *p) && (*p != '_') && (*p != '-'))
			return false;
	}
	return true;
}

/*
 * Write HCA device information for Aurora system
 * to slurmd spool directory.
 *
 * For every device in gres_devices the device file is read line by line.
 * The first line starting with "mlx5_" is taken as the HCA device name and
 * the output of "/usr/bin/ibv_devinfo -d <name>" is redirected to
 * "<slurmd_spooldir>/hca_devinfo_<name>".  Devices that cannot be opened or
 * that contain no usable name are skipped; failures are only logged.
 */
static void _write_hca_devinfo(void)
{
	const char *device_cmd = "/usr/bin/ibv_devinfo -d ";
	const char *search_word = "mlx5_";
	size_t search_len = strlen(search_word);
	char *command = NULL, *file_path = NULL, *slurmd_spooldir = NULL;
	ListIterator iter;
	gres_device_t *gres_device;
	FILE *fp = NULL;
	char line[BUF_SIZE];
	struct stat stat_buf;
	int rc;

	if (!gres_devices) {
		error("%s: no device data", __func__);
		return;
	}

	slurmd_spooldir = _get_slurmd_spooldir();
	if (!slurmd_spooldir) {
		error("%s: fail to get slurmd_spooldir", __func__);
		return;
	}
	debug("%s: slurmd_spooldir: %s", __func__, slurmd_spooldir);

	iter = list_iterator_create(gres_devices);
	while ((gres_device = list_next(iter))) {
		if (!gres_device->path)
			continue;

		if ((fp = fopen(gres_device->path, "r")) == NULL) {
			error("%s: fopen(%s): %m", __func__,
			      gres_device->path);
			continue;
		}
		debug("%s: device_path: %s", __func__, gres_device->path);

		while (fgets(line, sizeof(line), fp)) {
			if (xstrncmp(line, search_word, search_len) != 0)
				continue;

			/* Strip the newline kept by fgets(). */
			line[strcspn(line, "\n")] = '\0';

			/*
			 * The name ends up in a shell command line and in a
			 * file name, so refuse anything that could inject
			 * shell syntax or escape the spool directory.
			 */
			if (!_is_safe_hca_name(line)) {
				error("%s: ignoring invalid device name \"%s\" in %s",
				      __func__, line, gres_device->path);
				break;
			}

			xstrfmtcat(file_path, "%s/%s%s",
				slurmd_spooldir, hca_devinfo_name, line);
			xstrfmtcat(command, "%s%s > %s",
				device_cmd, line, file_path);
			debug("%s: file_path: %s", __func__, file_path);
			debug("%s: command: %s", __func__, command);

			/*
			 * Write result of ibv_devinfo command
			 * as the HCA device information.
			 * system() returns -1 (with errno set) only if the
			 * child could not be created; any other non-zero
			 * value is the wait status of the command.
			 */
			rc = system(command);
			if (rc == -1)
				error("%s: system(%s): %m", __func__, command);
			else if (rc != 0)
				error("%s: system(%s) returned status %d",
				      __func__, command, rc);
			else if (stat(file_path, &stat_buf) < 0)
				error("%s: stat(%s): %m",
					__func__, file_path);

			xfree(file_path);
			xfree(command);
			/* Only the first matching line of a device is used. */
			break;
		}
		fclose(fp);
	}
	list_iterator_destroy(iter);
	xfree(slurmd_spooldir);
}

/*
 * Read HCA port number from HCA device information.
 *
 * The information is read from "<slurmd_spooldir>/hca_devinfo_<device_name>",
 * the file written by _write_hca_devinfo().  The first line that contains
 * "port:" but not "transport:" is examined and the first decimal number
 * following "port:" is returned.
 *
 * IN device_name - HCA device name (e.g. "mlx5_0")
 * RET port number, or -1 if the file cannot be read or no valid port number
 *     was found
 */
static int _read_hca_portnum(char *device_name)
{
	const char *port_key = "port:";
	char *slurmd_spooldir = NULL, *file_path = NULL;
	char *keyp = NULL, *startp = NULL;
	unsigned long value;
	int port_num = -1;
	FILE *fp = NULL;
	char line[BUF_SIZE];

	debug("%s: device_name: %s", __func__, device_name);

	slurmd_spooldir = _get_slurmd_spooldir();
	if (!slurmd_spooldir) {
		error("%s: fail to get slurmd_spooldir", __func__);
		return -1;
	}
	debug("%s: slurmd_spooldir: %s", __func__, slurmd_spooldir);

	xstrfmtcat(file_path, "%s/%s%s",
		slurmd_spooldir, hca_devinfo_name, device_name);
	xfree(slurmd_spooldir);

	/*
	 * No stat() beforehand: fopen() reports a missing or unreadable file
	 * just as well and includes the reason via %m.
	 */
	if ((fp = fopen(file_path, "r")) == NULL) {
		error("%s: fopen(%s): %m", __func__, file_path);
		xfree(file_path);
		return -1;
	}
	debug("%s: file_path: %s", __func__, file_path);
	xfree(file_path);

	/*
	 * Read HCA port number from HCA device information.
	 * The HCA device information for Aurora system are written
	 * to a file in slurmd spool directory.
	 */
	while (fgets(line, sizeof(line), fp)) {
		if ((keyp = xstrstr(line, port_key)) == NULL)
			continue;
		if (xstrstr(line, "transport:") != NULL)
			continue;

		/* Skip everything between "port:" and the first digit. */
		startp = keyp + strlen(port_key);
		while ((*startp != '\0') && !isdigit((unsigned char) *startp))
			startp++;

		if (*startp != '\0') {
			/*
			 * startp points at a digit, so a conversion always
			 * takes place; overflow yields ULONG_MAX, which is
			 * rejected by the range check.
			 */
			value = strtoul(startp, NULL, 10);
			if (value <= INT_MAX)
				port_num = (int) value;
		}

		if (port_num == -1)
			debug("%s: not found port_num", __func__);
		else
			debug("%s: found port_num", __func__);
		/* Only the first "port:" line is considered. */
		break;
	}
	fclose(fp);

	debug("%s: port_num: %d", __func__, port_num);
	return port_num;
}

/*
 * Create set of device name and port number as HCA list.
 */
static void _create_hca_list(int device_num, char *prefix)
{
	char *device_path = NULL, *device_name = NULL;
	int port_num;
	gres_device_t *gres_device;
	ListIterator iter;
	FILE *fp = NULL;
	char line[BUF_SIZE];
	const char *search_word = "mlx5_";
	size_t search_len = strlen(search_word);

	debug("%s: device_num: %d", __func__, device_num);
	device_name = line;
	iter = list_iterator_create(gres_devices);
	while ((gres_device = list_next(iter))) {
		if ((device_num != -1) && (device_num != gres_device->dev_num))
			continue;
		device_path = xstrdup(gres_device->path);
		if ((fp = fopen(device_path, "r")) == NULL) {
			error("%s: fopen(%s): %m", __func__, device_path);
			xfree(device_path);
			break;
		}
		debug("%s: device_path: %s", __func__, device_path);
		/*
		 * Set of device name and port number are created in
		 * following format.
		 * <device-name>:<port-number>[,<device-name>:<port-number>…]
		 * ex) mlx5_0:1,mlx5_1:1
		 */
		while (fgets(line, sizeof(line), fp)) {
			if (xstrncmp(line, search_word, search_len) != 0)
				continue;
			xstrsubstitute(device_name, strchr(line, '\n'), "\0");
			// Read HCA port number from HCA device information.
			port_num = _read_hca_portnum(device_name);
			if (port_num == -1)
				xstrfmtcat(gres_hcalist, "%s%s",
					prefix, device_name);
			else
				xstrfmtcat(gres_hcalist, "%s%s:%d",
					prefix, device_name, port_num);
			debug("%s: gres_hcalist: %s", __func__, gres_hcalist);
			break;
		}
		fclose(fp);
		xfree(device_path);
		if (device_num != -1)
			break;
		else
			/*
			 * IF input parameter device_num is -1,
			 * create HCA list for all HCAs.
			 */
			prefix = ",";
	}
	list_iterator_destroy(iter);

	return;
}

/*
 * Write device name and port number for all HCAs
 * to slurmd spool directory.
 *
 * The list is built with _create_hca_list(-1, "") and written, followed by a
 * newline, to "<slurmd_spooldir>/all_hcalist".  gres_hcalist is released
 * before returning.  Failures are only logged.
 */
static void _write_all_hcalist(void)
{
	char *slurmd_spooldir = NULL, *file_path = NULL;
	FILE *fp = NULL;

	if (!gres_devices) {
		error("%s: no device data", __func__);
		return;
	}

	// Create set of device name and port number as HCA list.
	_create_hca_list(-1, "");

	if (!gres_hcalist) {
		error("%s: no device list", __func__);
		return;
	}
	debug("%s: gres_hcalist: %s", __func__, gres_hcalist);

	slurmd_spooldir = _get_slurmd_spooldir();
	if (!slurmd_spooldir) {
		error("%s: fail to get slurmd_spooldir", __func__);
		goto cleanup;
	}
	debug("%s: slurmd_spooldir: %s", __func__, slurmd_spooldir);

	xstrfmtcat(file_path, "%s/%s", slurmd_spooldir, all_hcalist_name);
	xfree(slurmd_spooldir);

	if ((fp = fopen(file_path, "w+")) == NULL) {
		error("%s: fopen(%s): %m", __func__, file_path);
		goto cleanup;
	}
	debug("%s: file_path: %s", __func__, file_path);

	if (fprintf(fp, "%s\n", gres_hcalist) < 0)
		error("%s: fprintf(%s): %m", __func__, file_path);
	/*
	 * The data is buffered, so a write error (e.g. a full file system)
	 * may only be reported when the stream is flushed by fclose().
	 */
	if (fclose(fp) != 0)
		error("%s: fclose(%s): %m", __func__, file_path);

cleanup:
	xfree(gres_hcalist);
	xfree(file_path);
}

/*
 * Set device name and port number for HCAs allocated to a job or job step
 * to environment variable for ScaTeFS.
 *
 * local_list is parsed as a comma separated list of device numbers; for each
 * of them _create_hca_list() appends "<device-name>[:<port-number>]" to
 * gres_hcalist.  The resulting list is then repeated once per entry of the
 * space separated environment variable _VENODELIST and stored in
 * _NEC_HCA_LIST_IO and _NEC_HCA_LIST_MPI.
 *
 * IN/OUT env_ptr - environment to read _VENODELIST from and to update
 * IN local_list - comma separated device numbers (e.g. "0,1")
 */
static void _set_env_hca_list(char ***env_ptr, char *local_list)
{
	char *ve_list = NULL, *vehca_list = NULL;
	char *ve_prefix = NULL, *hca_prefix = NULL;
	char *startp = NULL, *endp = NULL;
	bool valid = true;
	long device_num;

	if (!gres_devices) {
		error("%s: no device data", __func__);
		return;
	}

	debug("%s: local_list: %s", __func__, local_list);
	ve_list = xstrdup(getenvp(*env_ptr, "_VENODELIST"));
	if (!ve_list) {
		error("%s: fail to get env venodelist", __func__);
		return;
	}
	debug("%s: ve_list: %s", __func__, ve_list);

	startp = local_list;
	do {
		device_num = strtol(startp, &endp, 10);
		if (startp == endp)
			break;

		/*
		 * A negative value must not reach _create_hca_list():
		 * -1 is its "all HCAs" marker.
		 */
		if ((device_num < 0) || (device_num > INT_MAX)) {
			valid = false;
			break;
		}

		/*
		 * Separate entries with a comma, but only once the list
		 * really holds an entry; an index that matched no device
		 * must not leave a leading comma behind.
		 */
		hca_prefix = (gres_hcalist && gres_hcalist[0]) ? "," : "";

		// Create set of device name and port number as HCA list.
		_create_hca_list((int) device_num, hca_prefix);

		if (*endp == ',')
			startp = (endp + 1);
		else
			startp = endp;
	} while ((*endp != '\0'));

	if (!valid || (*endp != '\0') || (!gres_hcalist)) {
		xfree(ve_list);
		xfree(gres_hcalist);
		error("%s: no device list", __func__);
		return;
	}
	debug("%s: gres_hcalist: %s", __func__, gres_hcalist);

	// Set device name and port number as much as number of VEs.
	xstrcat(vehca_list, gres_hcalist);
	startp = ve_list;
	while ((ve_prefix = xstrchr(startp, ' '))) {
		xstrcatchar(vehca_list, *ve_prefix);
		xstrcat(vehca_list, gres_hcalist);
		startp = (ve_prefix + 1);
	}
	debug("%s: vehca_list: %s", __func__, vehca_list);
	env_array_overwrite(env_ptr, "_NEC_HCA_LIST_IO", vehca_list);
	env_array_overwrite(env_ptr, "_NEC_HCA_LIST_MPI", vehca_list);

	xfree(ve_list);
	xfree(gres_hcalist);
	xfree(vehca_list);
}

/*
 * Common worker of job_set_env(), step_set_env() and step_reset_env().
 *
 * If *already_seen is set, the lists are seeded from the values currently
 * found in the environment (SLURM_JOB_HCAS or SLURM_STEP_HCAS, and
 * HCA_VISIBLE_DEVICES).  common_gres_set_env() then updates both lists, and
 * every list that is set afterwards is written back to the environment.
 * When a local list exists, the ScaTeFS variables are set as well through
 * _set_env_hca_list() and *already_seen becomes true.
 */
static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx,
		     bitstr_t *usable_gres,
		     bool *already_seen, int *local_inx,
		     bool reset, bool is_job, gres_internal_flags_t flags)
{
	char *slurm_env_var = is_job ? "SLURM_JOB_HCAS" : "SLURM_STEP_HCAS";
	char *global_list = NULL;
	char *local_list = NULL;

	if (*already_seen) {
		global_list = xstrdup(getenvp(*env_ptr, slurm_env_var));
		local_list = xstrdup(getenvp(*env_ptr,
					     "HCA_VISIBLE_DEVICES"));
	}

	common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
			    usable_gres, "", local_inx, NULL,
			    &local_list, &global_list, reset, is_job, NULL,
			    flags);

	if (global_list) {
		env_array_overwrite(env_ptr, slurm_env_var, global_list);
		xfree(global_list);
	}

	if (!local_list)
		return;

	env_array_overwrite(env_ptr, "HCA_VISIBLE_DEVICES", local_list);

	/*
	 * Export device name and port number of the HCAs allocated to the
	 * job or job step for ScaTeFS.
	 */
	_set_env_hca_list(env_ptr, local_list);

	xfree(local_list);
	*already_seen = true;
}

/* Plugin load hook: logs that the plugin was loaded. */
extern int init(void)
{
	debug("%s: %s loaded", __func__, plugin_name);
	return SLURM_SUCCESS;
}

/* Plugin unload hook: logs the unload and releases the device list. */
extern int fini(void)
{
	debug("%s: unloading %s", __func__, plugin_name);

	FREE_NULL_LIST(gres_devices);
	return SLURM_SUCCESS;
}

/*
 * Load the devices of this GRES type from the gres.conf records through
 * common_node_config_load().  A device list left over from an earlier call
 * is dropped first.  On success the HCA device information files and the
 * list of all HCAs are written; a load failure is fatal.
 */
extern int node_config_load(List gres_conf_list, node_config_load_t *config)
{
	int rc;

	/* Assume this state is caused by an scontrol reconfigure */
	if (gres_devices) {
		debug("%s Resetting gres_devices", plugin_name);
		FREE_NULL_LIST(gres_devices);
	}

	rc = common_node_config_load(gres_conf_list, gres_name, &gres_devices);

	if (rc == SLURM_SUCCESS) {
		/* Per-device ibv_devinfo output for Aurora systems */
		_write_hca_devinfo();

		/* "<device-name>:<port-number>" list of every HCA */
		_write_all_hcalist();
	} else
		fatal("%s failed to load configuration", plugin_name);

	return rc;
}

/*
 * Set the environment variables for a job (i.e. all tasks) according to the
 * job's GRES state.
 */
extern void job_set_env(char ***job_env_ptr, void *gres_ptr, int node_inx,
			gres_internal_flags_t flags)
{
	/*
	 * Unlike in step_*_env these two are deliberately not static: this
	 * function may run in the slurmd, where every call can concern a
	 * different job, so no state of an unrelated job must be kept.  It
	 * can also be called several times (different prologs and such),
	 * and remembered state would yield bad info on every call after the
	 * first.
	 */
	bool seen = false;
	int inx = 0;

	_set_env(job_env_ptr, gres_ptr, node_inx, NULL,
		 &seen, &inx, false, true, flags);
}

/*
 * Set the environment variables for a job step (i.e. all tasks) according
 * to the job step's GRES state.  The index and the "already seen" flag are
 * static and therefore kept across calls.
 */
extern void step_set_env(char ***step_env_ptr, void *gres_ptr,
			 gres_internal_flags_t flags)
{
	static bool seen = false;
	static int inx = 0;

	_set_env(step_env_ptr, gres_ptr, 0, NULL,
		 &seen, &inx, false, false, flags);
}

/*
 * Reset the environment variables for one task of a job step according to
 * the job step's GRES state and the GRES usable by the task.  The index and
 * the "already seen" flag are static and therefore kept across calls.
 */
extern void step_reset_env(char ***step_env_ptr, void *gres_ptr,
			   bitstr_t *usable_gres, gres_internal_flags_t flags)
{
	static bool seen = false;
	static int inx = 0;

	_set_env(step_env_ptr, gres_ptr, 0, usable_gres,
		 &seen, &inx, true, false, flags);
}

/*
 * Send GRES information to slurmstepd: hands the device list to
 * common_send_stepd() together with the given buffer.
 */
extern void send_stepd(Buf buffer)
{
	common_send_stepd(buffer, gres_devices);
}

/*
 * Receive GRES information from slurmd: lets common_recv_stepd() fill the
 * device list from the given buffer.
 */
extern void recv_stepd(Buf buffer)
{
	common_recv_stepd(buffer, &gres_devices);
}

/*
 * Get data from a job's GRES data structure.
 * This plugin provides no such data, so every request fails.
 *
 * IN job_gres_data - job's GRES data structure
 * IN node_inx - zero-origin index of the node within the job's allocation
 *	for which data is desired
 * IN data_type - type of data to get from the job's data
 * OUT data - pointer to the data from job's GRES data structure
 *            DO NOT FREE: This is a pointer into the job's data structure
 * RET - always EINVAL
 */
extern int job_info(gres_job_state_t *job_gres_data, uint32_t node_inx,
		    enum gres_job_data_type data_type, void *data)
{
	return EINVAL;
}

/*
 * Get data from a step's GRES data structure.
 * This plugin provides no such data, so every request fails.
 *
 * IN step_gres_data - step's GRES data structure
 * IN node_inx - zero-origin index of the node within the job's allocation
 *	for which data is desired. Note this can differ from the step's
 *	node allocation index.
 * IN data_type - type of data to get from the step's data
 * OUT data - pointer to the data from step's GRES data structure
 *            DO NOT FREE: This is a pointer into the step's data structure
 * RET - always EINVAL
 */
extern int step_info(gres_step_state_t *step_gres_data, uint32_t node_inx,
		     enum gres_step_data_type data_type, void *data)
{
	return EINVAL;
}

/*
 * Return the list of devices of this type (elements are "gres_device_t").
 * The plugin's own list pointer is returned, not a copy; the list should be
 * freed using FREE_NULL_LIST().
 */
extern List get_devices(void)
{
	return gres_devices;
}

/* Step hardware setup hook: nothing to do for this plugin. */
extern void step_hardware_init(bitstr_t *usable_gres, char *settings)
{
}

/* Step hardware teardown hook: nothing to do for this plugin. */
extern void step_hardware_fini(void)
{
}

/*
 * Build the record used to set environment variables for a job's prolog or
 * epilog based on the GRES allocated to the job.
 * This plugin builds no such record and always returns NULL.
 */
extern gres_epilog_info_t *epilog_build_env(gres_job_state_t *gres_job_ptr)
{
	return NULL;
}

/*
 * Set environment variables for a job's prolog or epilog based on the GRES
 * allocated to the job.
 * This plugin sets nothing here; the environment is left untouched.
 */
extern void epilog_set_env(char ***epilog_env_ptr,
			   gres_epilog_info_t *epilog_info, int node_inx)
{
}
