`tgb.nodeproppred`

`NodePropPredDataset`

Bases: object

Source code in tgb/nodeproppred/dataset.py

class NodePropPredDataset(object):
    def __init__(
        self,
        name: str,
        root: Optional[str] = "datasets",
        meta_dict: Optional[dict] = None,
        preprocess: Optional[bool] = True,
    ) -> None:
        r"""Dataset class for the node property prediction task.

        Looks up the download url, evaluation metric and number of classes of the dataset,
        resolves the raw file paths, downloads the raw files when they are missing
        and optionally pre-processes the dataset.
        [!] node property prediction datasets require the following:
        self.meta_dict["fname"]: path to the edge list file
        self.meta_dict["nodefile"]: path to the node label file

        Parameters:
            name: name of the dataset
            root: root directory to store the dataset folder, appended to PROJ_DIR
            meta_dict: dictionary containing meta information about the dataset, should contain key 'dir_name' which is the name of the dataset folder
            preprocess: whether to pre-process the dataset
        Returns:
            None
        Raises:
            KeyError: if `name` has no entry in DATA_NUM_CLASSES, or `meta_dict` is given without 'dir_name'
            FileNotFoundError: if the dataset directory does not exist after the download step
        """
        self.name = name  ## original name
        # check if dataset url exist
        if self.name in DATA_URL_DICT:
            self.url = DATA_URL_DICT[self.name]
        else:
            self.url = None
            print(f"Dataset {self.name} url not found, download not supported yet.")

        # check if the evaluation metric is specified
        if self.name in DATA_EVAL_METRIC_DICT:
            self.metric = DATA_EVAL_METRIC_DICT[self.name]
        else:
            self.metric = None
            print(
                f"Dataset {self.name} default evaluation metric not found, it is not supported yet."
            )

        root = PROJ_DIR + root

        if meta_dict is None:
            self.dir_name = "_".join(name.split("-"))  ## replace hyphen with underline
            meta_dict = {"dir_name": self.dir_name}
        else:
            self.dir_name = meta_dict["dir_name"]
        self.root = osp.join(root, self.dir_name)
        self.meta_dict = meta_dict
        # fill in each default path on its own: a meta_dict that carries only one of
        # the two keys would otherwise hit a KeyError in download(), or lose the
        # path that the caller supplied
        self.meta_dict.setdefault(
            "fname", self.root + "/" + self.name + "_edgelist.csv"
        )
        self.meta_dict.setdefault(
            "nodefile", self.root + "/" + self.name + "_node_labels.csv"
        )

        #! version check
        self.version_passed = True
        self._version_check()

        self._num_classes = DATA_NUM_CLASSES[self.name]

        # initialize
        self._node_feat = None
        self._edge_feat = None
        self._full_data = None
        # the mask properties test these for None, so they have to exist even
        # when preprocess is False
        self._train_mask = None
        self._val_mask = None
        self._test_mask = None
        self.download()
        # the dataset directory has to exist at this point, otherwise give up
        if osp.isdir(self.root):
            print("Dataset directory is ", self.root)
        else:
            raise FileNotFoundError(f"Directory not found at {self.root}")

        if preprocess:
            self.pre_process()

        self.label_ts_idx = 0  # index for which node labels to return now

    def _version_check(self) -> None:
        r"""Check the dataset version and point the raw file names at that version.

        A dataset without an entry in DATA_VERSION_DICT fails the check.
        For a version above 1 the paths in self.meta_dict are rewritten to the
        versioned file names, and the check fails when the versioned edge list
        is not on disk. The outcome is reported through self.version_passed.
        """
        if self.name not in DATA_VERSION_DICT:
            print(f"Dataset {self.name} version number not found.")
            self.version_passed = False
            return None

        version = DATA_VERSION_DICT[self.name]
        if not (version > 1):
            # version 1 keeps the unversioned file names
            return None

        #* versioned files carry a "_v<version>" suffix before the extension
        version_suffix = "_v" + str(int(version)) + ".csv"
        file_prefix = self.root + "/" + self.name
        self.meta_dict["fname"] = file_prefix + "_edgelist" + version_suffix
        self.meta_dict["nodefile"] = file_prefix + "_node_labels" + version_suffix

        edgelist_found = osp.exists(self.meta_dict["fname"])
        if not edgelist_found:
            print(f"Dataset {self.name} version {int(version)} not found.")
            print(f"Please download the latest version of the dataset.")
            self.version_passed = False
            return None

    def download(self) -> None:
        r"""
        downloads this dataset from url
        check if files are already downloaded
        Returns:
            None
        """
        # check if the file already exists
        if osp.exists(self.meta_dict["fname"]) and osp.exists(
            self.meta_dict["nodefile"]
        ):
            print("raw file found, skipping download")
            return

        else:
            inp = input(
                "Will you download the dataset(s) now? (y/N)\n"
            ).lower()  # ask if the user wants to download the dataset
            if inp == "y":
                print(
                    f"{BColors.WARNING}Download started, this might take a while . . . {BColors.ENDC}"
                )
                print(f"Dataset title: {self.name}")

                if self.url is None:
                    raise Exception(
                        "Dataset url not found, download not supported yet."
                    )
                else:
                    r = requests.get(self.url, stream=True)
                    if osp.isdir(self.root):
                        print("Dataset directory is ", self.root)
                    else:
                        os.makedirs(self.root)

                    path_download = self.root + "/" + self.name + ".zip"
                    with open(path_download, "wb") as f:
                        total_length = int(r.headers.get("content-length"))
                        for chunk in progress.bar(
                            r.iter_content(chunk_size=1024),
                            expected_size=(total_length / 1024) + 1,
                        ):
                            if chunk:
                                f.write(chunk)
                                f.flush()
                    # for unzipping the file
                    with zipfile.ZipFile(path_download, "r") as zip_ref:
                        zip_ref.extractall(self.root)
                    print(f"{BColors.OKGREEN}Download completed {BColors.ENDC}")
            else:
                raise Exception(
                    BColors.FAIL
                    + "Data not found error, download "
                    + self.name
                    + " failed"
                )

    def generate_processed_files(
        self,
    ) -> Tuple[pd.DataFrame, Dict[int, Dict[str, Any]], np.ndarray]:
        r"""
        Load the processed files of the dataset from self.root, generating and
        saving them from the raw files when they are not there yet.
        Returns:
            df: pandas data frame storing the temporal edge list
            node_label_dict: dictionary with key as timestamp and item as dictionary of node labels
            edge_feat: edge features produced by the edge list loader
        Raises:
            ValueError: if the raw files have to be processed and no loader is defined for self.name
        """
        OUT_DF = self.root + "/" + "ml_{}.pkl".format(self.name)
        OUT_NODE_DF = self.root + "/" + "ml_{}_node.pkl".format(self.name)
        OUT_LABEL_DF = self.root + "/" + "ml_{}_label.pkl".format(self.name)
        OUT_EDGE_FEAT = self.root + "/" + "ml_{}.pkl".format(self.name + "_edge")

        is_large = self.name == "tgbn-reddit" or self.name == "tgbn-token"
        has_cache = (
            osp.exists(OUT_DF) and osp.exists(OUT_NODE_DF) and osp.exists(OUT_EDGE_FEAT)
        )

        # * logic for large datasets, as node label file is too big to store on disc
        # the label pickle is read below, so it has to be present as well;
        # when it is missing the files are regenerated further down
        if is_large and has_cache and osp.exists(OUT_LABEL_DF):
            df = pd.read_pickle(OUT_DF)
            edge_feat = load_pkl(OUT_EDGE_FEAT)
            if self.name == "tgbn-token":
                #! taking log normalization for numerical stability
                # NOTE(review): only this cached path applies the log; the freshly
                # processed path below returns edge_feat as load_edgelist_token
                # produced it - confirm the two paths are meant to differ
                print ("applying log normalization for weights in tgbn-token")
                edge_feat[:,0] = np.log(edge_feat[:,0])
            # for large datasets OUT_NODE_DF holds the node ids, the label
            # dictionary is rebuilt from the raw node label file
            node_ids = load_pkl(OUT_NODE_DF)
            labels_dict = load_pkl(OUT_LABEL_DF)
            node_label_dict = load_label_dict(
                self.meta_dict["nodefile"], node_ids, labels_dict
            )
            return df, node_label_dict, edge_feat

        # * load the preprocessed file if possible
        if has_cache and not is_large:
            print("loading processed file")
            df = pd.read_pickle(OUT_DF)
            node_label_dict = load_pkl(OUT_NODE_DF)
            edge_feat = load_pkl(OUT_EDGE_FEAT)
            return df, node_label_dict, edge_feat

        # * process the file
        print("file not processed, generating processed file")
        labels_dict = None  # stays None for tgbn-trade, whose loader returns no label mapping
        if self.name == "tgbn-reddit":
            df, edge_feat, node_ids, labels_dict = load_edgelist_sr(
                self.meta_dict["fname"], label_size=self._num_classes
            )
        elif self.name == "tgbn-token":
            df, edge_feat, node_ids, labels_dict = load_edgelist_token(
                self.meta_dict["fname"], label_size=self._num_classes
            )
        elif self.name == "tgbn-genre":
            df, edge_feat, node_ids, labels_dict = load_edgelist_datetime(
                self.meta_dict["fname"], label_size=self._num_classes
            )
        elif self.name == "tgbn-trade":
            df, edge_feat, node_ids = load_edgelist_trade(
                self.meta_dict["fname"], label_size=self._num_classes
            )
        else:
            # fail with a clear message instead of an unbound local further down
            raise ValueError(
                f"no edge list loader is defined for dataset {self.name}"
            )

        df.to_pickle(OUT_DF)
        save_pkl(edge_feat, OUT_EDGE_FEAT)

        if self.name == "tgbn-trade":
            node_label_dict = load_trade_label_dict(
                self.meta_dict["nodefile"], node_ids
            )
        else:
            node_label_dict = load_label_dict(
                self.meta_dict["nodefile"], node_ids, labels_dict
            )

        if is_large:
            # don't save the label dictionary on disc, the node label file is too big
            save_pkl(node_ids, OUT_NODE_DF)
            save_pkl(labels_dict, OUT_LABEL_DF)
        else:
            save_pkl(node_label_dict, OUT_NODE_DF)

        print("file processed and saved")
        return df, node_label_dict, edge_feat

    def pre_process(self) -> None:
        """
        Pre-process the dataset and generates the splits, must be run before dataset properties can be accessed
        Returns:
            None
        """
        # first check if all files exist
        if ("fname" not in self.meta_dict) or ("nodefile" not in self.meta_dict):
            raise Exception("meta_dict does not contain all required filenames")

        df, node_label_dict, edge_feat = self.generate_processed_files()
        sources = np.array(df["u"])
        destinations = np.array(df["i"])
        timestamps = np.array(df["ts"])
        edge_idxs = np.array(df["idx"])
        edge_label = np.ones(sources.shape[0])
        #self._edge_feat = np.array(df["w"])
        self._edge_feat = edge_feat

        full_data = {
            "sources": sources,
            "destinations": destinations,
            "timestamps": timestamps,
            "edge_idxs": edge_idxs,
            "edge_feat": self._edge_feat,
            "edge_label": edge_label,
        }
        self._full_data = full_data

        # storing the split masks
        _train_mask, _val_mask, _test_mask = self.generate_splits(full_data)

        self._train_mask = _train_mask
        self._val_mask = _val_mask
        self._test_mask = _test_mask

        self.label_dict = node_label_dict
        self.label_ts = np.array(list(node_label_dict.keys()))
        self.label_ts = np.sort(self.label_ts)

    def generate_splits(
        self,
        full_data: Dict[str, Any],
        val_ratio: float = 0.15,
        test_ratio: float = 0.15,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        r"""
        Split the edges chronologically into train, validation and test sets.
        The two cut-off times are quantiles of the edge timestamps.
        Parameters:
            full_data: dictionary containing the full dataset, read through its 'timestamps' entry
            val_ratio: ratio of validation data
            test_ratio: ratio of test data
        Returns:
            train_mask: boolean mask for training data
            val_mask: boolean mask for validation data
            test_mask: boolean mask for test data
        """
        ts = full_data["timestamps"]
        quantile_levels = [(1 - val_ratio - test_ratio), (1 - test_ratio)]
        val_start, test_start = np.quantile(ts, quantile_levels)

        # train: up to the first cut-off, test: after the second, validation: in between
        in_train = ts <= val_start
        in_test = ts > test_start
        in_val = np.logical_and(ts > val_start, ts <= test_start)

        return in_train, in_val, in_test

    def find_next_labels_batch(
        self,
        cur_t: int,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        r"""
        this returns the node labels closest to cur_t (for that given day)
        Parameters:
            cur_t: current timestamp of the batch of edges
        Returns:
            ts: timestamp of the node labels
            source_idx: node ids
            labels: the stacked label vectors
        """
        if self.label_ts_idx >= (self.label_ts.shape[0]):
            # for query that are after the last batch of labels
            return None
        else:
            ts = self.label_ts[self.label_ts_idx]

        if cur_t >= ts:
            self.label_ts_idx += 1  # move to the next ts
            # {ts: {node_id: label_vec}}
            node_ids = np.array(list(self.label_dict[ts].keys()))

            node_labels = []
            for key in self.label_dict[ts]:
                node_labels.append(np.array(self.label_dict[ts][key]))
            node_labels = np.stack(node_labels, axis=0)
            label_ts = np.full(node_ids.shape[0], ts, dtype="int")
            return (label_ts, node_ids, node_labels)
        else:
            return None

    def reset_label_time(self) -> None:
        r"""
        reset the pointer for node label once the entire dataset has been iterated once
        Returns:
            None
        """
        self.label_ts_idx = 0

    def return_label_ts(self) -> int:
        """
        return the current label timestamp that the pointer is at
        Returns:
            ts: int, the timestamp of the node labels
        """
        if (self.label_ts_idx >= self.label_ts.shape[0]):
            return self.label_ts[-1]
        else:
            return self.label_ts[self.label_ts_idx]

    @property
    def num_classes(self) -> int:
        """
        number of classes in the node label
        Returns:
            num_classes: int, number of classes
        """
        return self._num_classes

    @property
    def eval_metric(self) -> str:
        """
        the official evaluation metric for the dataset, loaded from info.py
        Returns:
            eval_metric: str, the evaluation metric
        """
        return self.metric

    # TODO not sure needed, to be removed
    @property
    def node_feat(self) -> Optional[np.ndarray]:
        r"""
        Returns the node features of the dataset with dim [N, feat_dim]
        Returns:
            node_feat: np.ndarray, [N, feat_dim] or None if there is no node feature
        """
        return self._node_feat

    # TODO not sure needed, to be removed
    @property
    def edge_feat(self) -> Optional[np.ndarray]:
        r"""
        Returns the edge features of the dataset with dim [E, feat_dim]
        Returns:
            edge_feat: np.ndarray, [E, feat_dim] or None if there is no edge feature
        """
        return self._edge_feat

    @property
    def full_data(self) -> Dict[str, Any]:
        r"""
        the full data of the dataset as a dictionary with keys: 'sources', 'destinations', 'timestamps', 'edge_idxs', 'edge_feat', 'w', 'edge_label',

        Returns:
            full_data: Dict[str, Any]
        """
        if self._full_data is None:
            raise ValueError(
                "dataset has not been processed yet, please call pre_process() first"
            )
        return self._full_data

    @property
    def train_mask(self) -> np.ndarray:
        r"""
        Returns the train mask of the dataset
        Returns:
            train_mask
        """
        if self._train_mask is None:
            raise ValueError("training split hasn't been loaded")
        return self._train_mask

    @property
    def val_mask(self) -> np.ndarray:
        r"""
        Returns the validation mask of the dataset
        Returns:
            val_mask: Dict[str, Any]
        """
        if self._val_mask is None:
            raise ValueError("validation split hasn't been loaded")

        return self._val_mask

    @property
    def test_mask(self) -> np.ndarray:
        r"""
        Returns the test mask of the dataset:
        Returns:
            test_mask: Dict[str, Any]
        """
        if self._test_mask is None:
            raise ValueError("test split hasn't been loaded")

        return self._test_mask

`edge_feat: Optional[np.ndarray]` `property`

Returns the edge features of the dataset with dim [E, feat_dim] Returns: edge_feat: np.ndarray, [E, feat_dim] or None if there is no edge feature

`eval_metric: str` `property`

the official evaluation metric for the dataset, loaded from info.py Returns: eval_metric: str, the evaluation metric

`full_data: Dict[str, Any]` `property`

The full data of the dataset as a dictionary with keys: 'sources', 'destinations', 'timestamps', 'edge_idxs', 'edge_feat', 'edge_label'.

Returns:

Name	Type	Description
`full_data`	`Dict[str, Any]`	dictionary holding the edge arrays of the full dataset

`node_feat: Optional[np.ndarray]` `property`

Returns the node features of the dataset with dim [N, feat_dim] Returns: node_feat: np.ndarray, [N, feat_dim] or None if there is no node feature

`num_classes: int` `property`

number of classes in the node label Returns: num_classes: int, number of classes

`test_mask: np.ndarray` `property`

Returns the test mask of the dataset. Returns: test_mask: np.ndarray, boolean mask selecting the test edges

`train_mask: np.ndarray` `property`

Returns the train mask of the dataset Returns: train_mask

`val_mask: np.ndarray` `property`

Returns the validation mask of the dataset. Returns: val_mask: np.ndarray, boolean mask selecting the validation edges

`__init__(name, root='datasets', meta_dict=None, preprocess=True)`

Dataset class for the node property prediction task. Stores meta information about each dataset, such as evaluation metrics, and also automatically pre-processes the dataset. [!] Node property prediction datasets require the following: self.meta_dict["fname"]: path to the edge list file; self.meta_dict["nodefile"]: path to the node label file.

Parameters:

Name	Type	Description	Default
`name`	`str`	name of the dataset	required
`root`	`Optional[str]`	root directory to store the dataset folder	`'datasets'`
`meta_dict`	`Optional[dict]`	dictionary containing meta information about the dataset, should contain key 'dir_name' which is the name of the dataset folder	`None`
`preprocess`	`Optional[bool]`	whether to pre-process the dataset	`True`

Returns: None

Source code in tgb/nodeproppred/dataset.py

def __init__(
    self,
    name: str,
    root: Optional[str] = "datasets",
    meta_dict: Optional[dict] = None,
    preprocess: Optional[bool] = True,
) -> None:
    r"""Dataset class for the node property prediction task.

    Looks up the download url, evaluation metric and number of classes of the dataset,
    resolves the raw file paths, downloads the raw files when they are missing
    and optionally pre-processes the dataset.
    [!] node property prediction datasets require the following:
    self.meta_dict["fname"]: path to the edge list file
    self.meta_dict["nodefile"]: path to the node label file

    Parameters:
        name: name of the dataset
        root: root directory to store the dataset folder, appended to PROJ_DIR
        meta_dict: dictionary containing meta information about the dataset, should contain key 'dir_name' which is the name of the dataset folder
        preprocess: whether to pre-process the dataset
    Returns:
        None
    Raises:
        KeyError: if `name` has no entry in DATA_NUM_CLASSES, or `meta_dict` is given without 'dir_name'
        FileNotFoundError: if the dataset directory does not exist after the download step
    """
    self.name = name  ## original name
    # check if dataset url exist
    if self.name in DATA_URL_DICT:
        self.url = DATA_URL_DICT[self.name]
    else:
        self.url = None
        print(f"Dataset {self.name} url not found, download not supported yet.")

    # check if the evaluation metric is specified
    if self.name in DATA_EVAL_METRIC_DICT:
        self.metric = DATA_EVAL_METRIC_DICT[self.name]
    else:
        self.metric = None
        print(
            f"Dataset {self.name} default evaluation metric not found, it is not supported yet."
        )

    root = PROJ_DIR + root

    if meta_dict is None:
        self.dir_name = "_".join(name.split("-"))  ## replace hyphen with underline
        meta_dict = {"dir_name": self.dir_name}
    else:
        self.dir_name = meta_dict["dir_name"]
    self.root = osp.join(root, self.dir_name)
    self.meta_dict = meta_dict
    # fill in each default path on its own: a meta_dict that carries only one of
    # the two keys would otherwise hit a KeyError in download(), or lose the
    # path that the caller supplied
    self.meta_dict.setdefault(
        "fname", self.root + "/" + self.name + "_edgelist.csv"
    )
    self.meta_dict.setdefault(
        "nodefile", self.root + "/" + self.name + "_node_labels.csv"
    )

    #! version check
    self.version_passed = True
    self._version_check()

    self._num_classes = DATA_NUM_CLASSES[self.name]

    # initialize
    self._node_feat = None
    self._edge_feat = None
    self._full_data = None
    # the mask properties test these for None, so they have to exist even
    # when preprocess is False
    self._train_mask = None
    self._val_mask = None
    self._test_mask = None
    self.download()
    # the dataset directory has to exist at this point, otherwise give up
    if osp.isdir(self.root):
        print("Dataset directory is ", self.root)
    else:
        raise FileNotFoundError(f"Directory not found at {self.root}")

    if preprocess:
        self.pre_process()

    self.label_ts_idx = 0  # index for which node labels to return now

`download()`

downloads this dataset from url check if files are already downloaded Returns: None

Source code in tgb/nodeproppred/dataset.py

def download(self) -> None:
    r"""
    Download the dataset archive from self.url and unzip it into self.root.

    Nothing is downloaded when both the edge list file and the node label file
    named in self.meta_dict already exist. Otherwise the user is asked on stdin
    to confirm the download.
    Returns:
        None
    Raises:
        Exception: if the user declines the download or no url is known for the dataset
        requests.HTTPError: if the server answers with an error status
    """
    # check if the file already exists
    if osp.exists(self.meta_dict["fname"]) and osp.exists(
        self.meta_dict["nodefile"]
    ):
        print("raw file found, skipping download")
        return

    inp = input(
        "Will you download the dataset(s) now? (y/N)\n"
    ).lower()  # ask if the user wants to download the dataset
    if inp != "y":
        raise Exception(
            BColors.FAIL
            + "Data not found error, download "
            + self.name
            + " failed"
        )

    print(
        f"{BColors.WARNING}Download started, this might take a while . . . {BColors.ENDC}"
    )
    print(f"Dataset title: {self.name}")

    if self.url is None:
        raise Exception("Dataset url not found, download not supported yet.")

    r = requests.get(self.url, stream=True)
    try:
        # fail here on an HTTP error instead of saving the error page as a zip
        r.raise_for_status()

        if osp.isdir(self.root):
            print("Dataset directory is ", self.root)
        else:
            os.makedirs(self.root)

        path_download = self.root + "/" + self.name + ".zip"
        chunks = r.iter_content(chunk_size=1024)
        # the server may omit content-length; only show the progress bar
        # when the expected size is known
        content_length = r.headers.get("content-length")
        if content_length is not None:
            chunks = progress.bar(
                chunks,
                expected_size=(int(content_length) / 1024) + 1,
            )
        with open(path_download, "wb") as f:
            for chunk in chunks:
                if chunk:
                    f.write(chunk)
    finally:
        # release the streamed connection even when the download fails
        r.close()

    # for unzipping the file
    with zipfile.ZipFile(path_download, "r") as zip_ref:
        zip_ref.extractall(self.root)
    print(f"{BColors.OKGREEN}Download completed {BColors.ENDC}")

`find_next_labels_batch(cur_t)`

this returns the node labels closest to cur_t (for that given day) Parameters: cur_t: current timestamp of the batch of edges Returns: ts: timestamp of the node labels source_idx: node ids labels: the stacked label vectors

Source code in tgb/nodeproppred/dataset.py

def find_next_labels_batch(
    self,
    cur_t: int,
) -> Optional[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    r"""
    Return the next unread batch of node labels once cur_t has reached its timestamp.
    Each successful call moves the label pointer forward by one timestamp.
    Parameters:
        cur_t: current timestamp of the batch of edges
    Returns:
        None when all label timestamps have been consumed or cur_t is still
        before the next label timestamp, otherwise a tuple of
        ts: timestamp of the node labels, repeated once per node
        source_idx: node ids
        labels: the stacked label vectors
    """
    if self.label_ts_idx >= self.label_ts.shape[0]:
        # for query that are after the last batch of labels
        return None

    ts = self.label_ts[self.label_ts_idx]
    if not (cur_t >= ts):
        # the next labels lie in the future of this edge batch
        return None

    self.label_ts_idx += 1  # move to the next ts
    # {ts: {node_id: label_vec}}
    labels_at_ts = self.label_dict[ts]
    node_ids = np.array(list(labels_at_ts.keys()))
    # keys() and values() iterate in the same order, so row k belongs to node_ids[k]
    node_labels = np.stack(
        [np.array(label_vec) for label_vec in labels_at_ts.values()], axis=0
    )
    label_ts = np.full(node_ids.shape[0], ts, dtype="int")
    return (label_ts, node_ids, node_labels)

`generate_processed_files()`

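Returns the temporal edge list as a pandas data frame, together with the node labels and the edge features. Returns: df: pandas data frame storing the temporal edge list; node_label_dict: dictionary with key as timestamp and item as dictionary of node labels; edge_feat: edge features loaded or generated together with the edge list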

Source code in tgb/nodeproppred/dataset.py

def generate_processed_files(
    self,
) -> Tuple[pd.DataFrame, Dict[int, Dict[str, Any]], np.ndarray]:
    r"""
    Load the processed files of the dataset from self.root, generating and
    saving them from the raw files when they are not there yet.
    Returns:
        df: pandas data frame storing the temporal edge list
        node_label_dict: dictionary with key as timestamp and item as dictionary of node labels
        edge_feat: edge features produced by the edge list loader
    Raises:
        ValueError: if the raw files have to be processed and no loader is defined for self.name
    """
    OUT_DF = self.root + "/" + "ml_{}.pkl".format(self.name)
    OUT_NODE_DF = self.root + "/" + "ml_{}_node.pkl".format(self.name)
    OUT_LABEL_DF = self.root + "/" + "ml_{}_label.pkl".format(self.name)
    OUT_EDGE_FEAT = self.root + "/" + "ml_{}.pkl".format(self.name + "_edge")

    is_large = self.name == "tgbn-reddit" or self.name == "tgbn-token"
    has_cache = (
        osp.exists(OUT_DF) and osp.exists(OUT_NODE_DF) and osp.exists(OUT_EDGE_FEAT)
    )

    # * logic for large datasets, as node label file is too big to store on disc
    # the label pickle is read below, so it has to be present as well;
    # when it is missing the files are regenerated further down
    if is_large and has_cache and osp.exists(OUT_LABEL_DF):
        df = pd.read_pickle(OUT_DF)
        edge_feat = load_pkl(OUT_EDGE_FEAT)
        if self.name == "tgbn-token":
            #! taking log normalization for numerical stability
            # NOTE(review): only this cached path applies the log; the freshly
            # processed path below returns edge_feat as load_edgelist_token
            # produced it - confirm the two paths are meant to differ
            print ("applying log normalization for weights in tgbn-token")
            edge_feat[:,0] = np.log(edge_feat[:,0])
        # for large datasets OUT_NODE_DF holds the node ids, the label
        # dictionary is rebuilt from the raw node label file
        node_ids = load_pkl(OUT_NODE_DF)
        labels_dict = load_pkl(OUT_LABEL_DF)
        node_label_dict = load_label_dict(
            self.meta_dict["nodefile"], node_ids, labels_dict
        )
        return df, node_label_dict, edge_feat

    # * load the preprocessed file if possible
    if has_cache and not is_large:
        print("loading processed file")
        df = pd.read_pickle(OUT_DF)
        node_label_dict = load_pkl(OUT_NODE_DF)
        edge_feat = load_pkl(OUT_EDGE_FEAT)
        return df, node_label_dict, edge_feat

    # * process the file
    print("file not processed, generating processed file")
    labels_dict = None  # stays None for tgbn-trade, whose loader returns no label mapping
    if self.name == "tgbn-reddit":
        df, edge_feat, node_ids, labels_dict = load_edgelist_sr(
            self.meta_dict["fname"], label_size=self._num_classes
        )
    elif self.name == "tgbn-token":
        df, edge_feat, node_ids, labels_dict = load_edgelist_token(
            self.meta_dict["fname"], label_size=self._num_classes
        )
    elif self.name == "tgbn-genre":
        df, edge_feat, node_ids, labels_dict = load_edgelist_datetime(
            self.meta_dict["fname"], label_size=self._num_classes
        )
    elif self.name == "tgbn-trade":
        df, edge_feat, node_ids = load_edgelist_trade(
            self.meta_dict["fname"], label_size=self._num_classes
        )
    else:
        # fail with a clear message instead of an unbound local further down
        raise ValueError(
            f"no edge list loader is defined for dataset {self.name}"
        )

    df.to_pickle(OUT_DF)
    save_pkl(edge_feat, OUT_EDGE_FEAT)

    if self.name == "tgbn-trade":
        node_label_dict = load_trade_label_dict(
            self.meta_dict["nodefile"], node_ids
        )
    else:
        node_label_dict = load_label_dict(
            self.meta_dict["nodefile"], node_ids, labels_dict
        )

    if is_large:
        # don't save the label dictionary on disc, the node label file is too big
        save_pkl(node_ids, OUT_NODE_DF)
        save_pkl(labels_dict, OUT_LABEL_DF)
    else:
        save_pkl(node_label_dict, OUT_NODE_DF)

    print("file processed and saved")
    return df, node_label_dict, edge_feat

`generate_splits(full_data, val_ratio=0.15, test_ratio=0.15)`

Generates train, validation, and test splits from the full dataset Parameters: full_data: dictionary containing the full dataset val_ratio: ratio of validation data test_ratio: ratio of test data Returns: train_mask: boolean mask for training data val_mask: boolean mask for validation data test_mask: boolean mask for test data

Source code in tgb/nodeproppred/dataset.py

def generate_splits(
    self,
    full_data: Dict[str, Any],
    val_ratio: float = 0.15,
    test_ratio: float = 0.15,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    r"""
    Split the edges chronologically into train, validation and test sets.
    The two cut-off times are quantiles of the edge timestamps.
    Parameters:
        full_data: dictionary containing the full dataset, read through its 'timestamps' entry
        val_ratio: ratio of validation data
        test_ratio: ratio of test data
    Returns:
        train_mask: boolean mask for training data
        val_mask: boolean mask for validation data
        test_mask: boolean mask for test data
    """
    ts = full_data["timestamps"]
    quantile_levels = [(1 - val_ratio - test_ratio), (1 - test_ratio)]
    val_start, test_start = np.quantile(ts, quantile_levels)

    # train: up to the first cut-off, test: after the second, validation: in between
    in_train = ts <= val_start
    in_test = ts > test_start
    in_val = np.logical_and(ts > val_start, ts <= test_start)

    return in_train, in_val, in_test

`pre_process()`

Pre-process the dataset and generates the splits, must be run before dataset properties can be accessed Returns: None

Source code in tgb/nodeproppred/dataset.py

def pre_process(self) -> None:
    """
    Build the in-memory dataset: edge arrays, split masks and the sorted
    label timestamps. Must be run before the dataset properties can be accessed.
    Returns:
        None
    Raises:
        Exception: if meta_dict lacks the 'fname' or 'nodefile' entry
    """
    # both raw file paths have to be registered before anything is loaded
    required_keys = ("fname", "nodefile")
    if not all(key in self.meta_dict for key in required_keys):
        raise Exception("meta_dict does not contain all required filenames")

    edges, labels_by_ts, edge_feat = self.generate_processed_files()

    src = np.array(edges["u"])
    self._edge_feat = edge_feat
    self._full_data = {
        "sources": src,
        "destinations": np.array(edges["i"]),
        "timestamps": np.array(edges["ts"]),
        "edge_idxs": np.array(edges["idx"]),
        "edge_feat": edge_feat,
        # every edge gets the label 1
        "edge_label": np.ones(src.shape[0]),
    }

    # storing the split masks
    (
        self._train_mask,
        self._val_mask,
        self._test_mask,
    ) = self.generate_splits(self._full_data)

    # keep the label timestamps in ascending order
    self.label_dict = labels_by_ts
    self.label_ts = np.sort(np.array(list(labels_by_ts.keys())))

`reset_label_time()`

reset the pointer for node label once the entire dataset has been iterated once Returns: None

Source code in tgb/nodeproppred/dataset.py

def reset_label_time(self) -> None:
    r"""
    Rewind the node-label pointer (``self.label_ts_idx``) to position 0,
    to be called once the entire dataset has been iterated once.
    Returns:
        None
    """
    self.label_ts_idx = 0

`return_label_ts()`

Returns the label timestamp that the pointer is currently at. Returns: ts: int, the timestamp of the node labels

Source code in tgb/nodeproppred/dataset.py

def return_label_ts(self) -> int:
    """
    Look up the label timestamp under the current pointer.

    When the pointer (``self.label_ts_idx``) has moved past the end of
    ``self.label_ts``, the last stored timestamp is returned instead.
    Returns:
        ts: int, the timestamp of the node labels
    """
    label_times = self.label_ts
    pointer = self.label_ts_idx
    past_the_end = pointer >= label_times.shape[0]
    return label_times[-1] if past_the_end else label_times[pointer]

`PyGNodePropPredDataset`

Bases: InMemoryDataset

PyG wrapper for the NodePropPredDataset. It can return PyTorch tensors for src, dst, t, msg and label, can return a TemporalData object, and can also query the node labels corresponding to a timestamp from an edge batch. Parameters: name: name of the dataset, passed to NodePropPredDataset. root (string): root directory where the dataset should be saved. transform (callable, optional): a function/transform that is passed on to the InMemoryDataset constructor. pre_transform (callable, optional): a function/transform that is passed on to the InMemoryDataset constructor.

Source code in tgb/nodeproppred/dataset_pyg.py

class PyGNodePropPredDataset(InMemoryDataset):
    r"""
    PyG-facing wrapper around `NodePropPredDataset`.

    Exposes the edge list (src, dst, t, msg, label) as pytorch tensors,
    can bundle it into a `TemporalData` object, and can query the node
    labels corresponding to a timestamp taken from an edge batch.

    Parameters:
        name: name of the dataset, passed to `NodePropPredDataset`
        root (string): Root directory where the dataset should be saved.
        transform (callable, optional): forwarded unchanged to the
            `InMemoryDataset` constructor
        pre_transform (callable, optional): forwarded unchanged to the
            `InMemoryDataset` constructor
    """

    def __init__(
        self,
        name: str,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        self.name = name
        self.root = root

        # the numpy-backed dataset is loaded first; masks and class count
        # are read from it before the parent constructor runs
        wrapped = NodePropPredDataset(name=name, root=root)
        self.dataset = wrapped
        self._train_mask = torch.from_numpy(wrapped.train_mask)
        self._val_mask = torch.from_numpy(wrapped.val_mask)
        self._test_mask = torch.from_numpy(wrapped.test_mask)
        self.__num_classes = wrapped.num_classes

        super().__init__(root, transform, pre_transform)
        self.process_data()

    @property
    def num_classes(self) -> int:
        """
        number of classes in the node label, as reported by the wrapped dataset
        Returns:
            num_classes: int
        """
        return self.__num_classes

    @property
    def eval_metric(self) -> str:
        """
        the evaluation metric of the dataset, read from the wrapped dataset
        Returns:
            eval_metric: str, the evaluation metric
        """
        return self.dataset.eval_metric

    @property
    def train_mask(self) -> torch.Tensor:
        r"""
        Mask selecting the training edges
        Returns:
            train_mask: the mask for edges in the training set
        Raises:
            ValueError: if the mask is None
        """
        mask = self._train_mask
        if mask is not None:
            return mask
        raise ValueError("training split hasn't been loaded")

    @property
    def val_mask(self) -> torch.Tensor:
        r"""
        Mask selecting the validation edges
        Returns:
            val_mask: the mask for edges in the validation set
        Raises:
            ValueError: if the mask is None
        """
        mask = self._val_mask
        if mask is not None:
            return mask
        raise ValueError("validation split hasn't been loaded")

    @property
    def test_mask(self) -> torch.Tensor:
        r"""
        Mask selecting the test edges
        Returns:
            test_mask: the mask for edges in the test set
        Raises:
            ValueError: if the mask is None
        """
        mask = self._test_mask
        if mask is not None:
            return mask
        raise ValueError("test split hasn't been loaded")

    @property
    def src(self) -> torch.Tensor:
        r"""
        Source nodes of all edges
        Returns:
            src: the idx of the source nodes
        """
        return self._src

    @property
    def dst(self) -> torch.Tensor:
        r"""
        Destination nodes of all edges
        Returns:
            dst: the idx of the destination nodes
        """
        return self._dst

    @property
    def ts(self) -> torch.Tensor:
        r"""
        Timestamps of all edges
        Returns:
            ts: the timestamps of the edges
        """
        return self._ts

    @property
    def edge_feat(self) -> torch.Tensor:
        r"""
        Edge features of the dataset
        Returns:
            edge_feat: the edge features
        """
        return self._edge_feat

    @property
    def edge_label(self) -> torch.Tensor:
        r"""
        Edge labels of the dataset
        Returns:
            edge_label: the labels of the edges (all one tensor)
        """
        return self._edge_label

    def process_data(self):
        """
        Convert the numpy arrays of the wrapped dataset into pytorch tensors
        and store them on the instance (`_src`, `_dst`, `_ts`, `_edge_label`,
        `_edge_feat`).
        """
        raw = self.dataset.full_data
        src = torch.from_numpy(raw["sources"])
        dst = torch.from_numpy(raw["destinations"])
        t = torch.from_numpy(raw["timestamps"])
        edge_label = torch.from_numpy(raw["edge_label"])
        msg = torch.from_numpy(raw["edge_feat"])

        # node ids and timestamps are kept as int64, features as float32;
        # the edge label keeps whatever dtype it was loaded with
        self._src = src if src.dtype == torch.int64 else src.long()
        self._dst = dst if dst.dtype == torch.int64 else dst.long()
        self._ts = t if t.dtype == torch.int64 else t.long()
        self._edge_label = edge_label
        self._edge_feat = msg if msg.dtype == torch.float32 else msg.float()

    def get_TemporalData(
        self,
    ) -> TemporalData:
        """
        Bundle the whole edge list into one TemporalData object
        Returns:
            data: TemporalData object storing the edgelist
        """
        return TemporalData(
            src=self._src,
            dst=self._dst,
            t=self._ts,
            msg=self._edge_feat,
            y=self._edge_label,
        )

    def reset_label_time(self) -> None:
        """
        reset the pointer for the node labels, should be done per epoch
        """
        self.dataset.reset_label_time()

    def get_node_label(self, cur_t):
        """
        Fetch the node labels for the current timestamp as tensors

        Parameters:
            cur_t: timestamp handed to the wrapped dataset's
                `find_next_labels_batch`
        Returns:
            None when the wrapped dataset returns None, otherwise the tuple
            (label_ts, label_srcs, labels) with int64, int64 and float32 dtype
        """
        batch = self.dataset.find_next_labels_batch(cur_t)
        if batch is None:
            return None
        raw_ts = batch[0]
        raw_srcs = batch[1]
        raw_labels = batch[2]
        return (
            torch.from_numpy(raw_ts).long(),
            torch.from_numpy(raw_srcs).long(),
            torch.from_numpy(raw_labels).to(torch.float32),
        )

    def get_label_time(self) -> int:
        """
        Timestamp of the current node labels, as reported by the wrapped dataset
        Returns:
            t: time of the current node labels
        """
        return self.dataset.return_label_ts()

    def len(self) -> int:
        """
        size of the dataset, i.e. the first dimension of the source tensor
        Returns:
            size: int
        """
        return self._src.shape[0]

    def get(self, idx: int) -> TemporalData:
        """
        Build a TemporalData object for a single edge
        Parameters:
            idx: index of the edge
        Returns:
            data: TemporalData object
        """
        return TemporalData(
            src=self._src[idx],
            dst=self._dst[idx],
            t=self._ts[idx],
            msg=self._edge_feat[idx],
            y=self._edge_label[idx],
        )

    def __repr__(self) -> str:
        return f"{self.name.capitalize()}()"

`dst: torch.Tensor` `property`

Returns the destination nodes of the dataset Returns: dst: the idx of the destination nodes

`edge_feat: torch.Tensor` `property`

Returns the edge features of the dataset Returns: edge_feat: the edge features

`edge_label: torch.Tensor` `property`

Returns the edge labels of the dataset Returns: edge_label: the labels of the edges (all one tensor)

`eval_metric: str` `property`

the official evaluation metric for the dataset, loaded from info.py Returns: eval_metric: str, the evaluation metric

`num_classes: int` `property`

how many classes are in the node label Returns: num_classes: int

`src: torch.Tensor` `property`

Returns the source nodes of the dataset Returns: src: the idx of the source nodes

`test_mask: torch.Tensor` `property`

Returns the test mask of the dataset: Returns: test_mask: the mask for edges in the test set

`train_mask: torch.Tensor` `property`

Returns the train mask of the dataset Returns: train_mask: the mask for edges in the training set

`ts: torch.Tensor` `property`

Returns the timestamps of the dataset Returns: ts: the timestamps of the edges

`val_mask: torch.Tensor` `property`

Returns the validation mask of the dataset Returns: val_mask: the mask for edges in the validation set

`get(idx)`

construct temporal data object for a single edge Parameters: idx: index of the edge Returns: data: TemporalData object

Source code in tgb/nodeproppred/dataset_pyg.py

def get(self, idx: int) -> TemporalData:
    """
    Build a TemporalData object for a single edge, indexing every stored
    edge tensor at the same position.
    Parameters:
        idx: index of the edge
    Returns:
        data: TemporalData object
    """
    return TemporalData(
        src=self._src[idx],
        dst=self._dst[idx],
        t=self._ts[idx],
        msg=self._edge_feat[idx],
        y=self._edge_label[idx],
    )

`get_TemporalData()`

return the TemporalData object for the entire dataset Returns: data: TemporalData object storing the edgelist

Source code in tgb/nodeproppred/dataset_pyg.py

def get_TemporalData(
    self,
) -> TemporalData:
    """
    Bundle the stored edge tensors of the whole dataset into one
    TemporalData object.
    Returns:
        data: TemporalData object storing the edgelist
    """
    return TemporalData(
        src=self._src,
        dst=self._dst,
        t=self._ts,
        msg=self._edge_feat,
        y=self._edge_label,
    )

`get_label_time()`

return the timestamps of the current node labels Returns: t: time of the current node labels

Source code in tgb/nodeproppred/dataset_pyg.py

def get_label_time(self) -> int:
    """
    Ask the wrapped dataset for the timestamp of the current node labels.
    Returns:
        t: time of the current node labels
    """
    current_label_time = self.dataset.return_label_ts()
    return current_label_time

`get_node_label(cur_t)`

return the node labels for the current timestamp

Source code in tgb/nodeproppred/dataset_pyg.py

def get_node_label(self, cur_t):
    """
    Fetch the node labels for the current timestamp as tensors.

    Parameters:
        cur_t: timestamp handed to the wrapped dataset's
            `find_next_labels_batch`
    Returns:
        None when the wrapped dataset returns None, otherwise the tuple
        (label_ts, label_srcs, labels) with int64, int64 and float32 dtype
    """
    batch = self.dataset.find_next_labels_batch(cur_t)
    if batch is None:
        return None
    raw_ts = batch[0]
    raw_srcs = batch[1]
    raw_labels = batch[2]
    return (
        torch.from_numpy(raw_ts).long(),
        torch.from_numpy(raw_srcs).long(),
        torch.from_numpy(raw_labels).to(torch.float32),
    )

`len()`

size of the dataset Returns: size: int

Source code in tgb/nodeproppred/dataset_pyg.py

def len(self) -> int:
    """
    size of the dataset, taken from the first dimension of the source tensor
    Returns:
        size: int
    """
    num_edges = self._src.shape[0]
    return num_edges

`process_data()`

convert data to pytorch tensors

Source code in tgb/nodeproppred/dataset_pyg.py

def process_data(self):
    """
    Convert the numpy arrays of the wrapped dataset into pytorch tensors
    and store them on the instance (`_src`, `_dst`, `_ts`, `_edge_label`,
    `_edge_feat`).
    """
    raw = self.dataset.full_data
    src = torch.from_numpy(raw["sources"])
    dst = torch.from_numpy(raw["destinations"])
    t = torch.from_numpy(raw["timestamps"])
    edge_label = torch.from_numpy(raw["edge_label"])
    msg = torch.from_numpy(raw["edge_feat"])

    # node ids and timestamps are kept as int64, features as float32;
    # the edge label keeps whatever dtype it was loaded with
    self._src = src if src.dtype == torch.int64 else src.long()
    self._dst = dst if dst.dtype == torch.int64 else dst.long()
    self._ts = t if t.dtype == torch.int64 else t.long()
    self._edge_label = edge_label
    self._edge_feat = msg if msg.dtype == torch.float32 else msg.float()

`reset_label_time()`

reset the pointer for the node labels, should be done per epoch

Source code in tgb/nodeproppred/dataset_pyg.py

def reset_label_time(self) -> None:
    """
    reset the pointer for the node labels, should be done per epoch

    Delegates to the wrapped dataset's `reset_label_time()`.
    Returns:
        None
    """
    self.dataset.reset_label_time()

`Evaluator`

Bases: object

Evaluator for Node Property Prediction

Source code in tgb/nodeproppred/evaluate.py

class Evaluator(object):
    """Evaluator for Node Property Prediction"""

    def __init__(self, name: str):
        r"""
        Parameters:
            name: name of the dataset
        Raises:
            NotImplementedError: if `name` has no entry in DATA_EVAL_METRIC_DICT
        """
        self.name = name
        self.valid_metric_list = ["mse", "rmse", "ndcg"]
        if self.name not in DATA_EVAL_METRIC_DICT:
            raise NotImplementedError("Dataset not supported")

    def _parse_and_check_input(self, input_dict):
        """
        check whether the input has the required format
        Parameters:
            -input_dict: a dictionary containing "y_true", "y_pred", and "eval_metric"

            note: "eval_metric" should be a list including one or more of the
                  metrics in `self.valid_metric_list`
        Returns:
            (y_true, y_pred) as numpy arrays of identical shape
        Raises:
            RuntimeError: if a required key is missing, if an input is neither
                a numpy ndarray nor a torch tensor, or if the shapes differ
            ValueError: if a requested metric is unknown or no metric is requested
        """
        if "eval_metric" not in input_dict:
            raise RuntimeError("Missing key of eval_metric")

        y_true, y_pred = None, None
        arrays_checked = False
        for eval_metric in input_dict["eval_metric"]:
            if eval_metric not in self.valid_metric_list:
                print(
                    "ERROR: The evaluation metric should be in:", self.valid_metric_list
                )
                raise ValueError("Undefined eval metric %s " % (eval_metric))

            if arrays_checked:
                # the arrays do not depend on the metric; validate them once
                continue

            if "y_true" not in input_dict:
                raise RuntimeError("Missing key of y_true")
            if "y_pred" not in input_dict:
                raise RuntimeError("Missing key of y_pred")

            y_true, y_pred = input_dict["y_true"], input_dict["y_pred"]

            # converting to numpy on cpu
            if torch is not None and isinstance(y_true, torch.Tensor):
                y_true = y_true.detach().cpu().numpy()
            if torch is not None and isinstance(y_pred, torch.Tensor):
                y_pred = y_pred.detach().cpu().numpy()

            # check type and shape
            if not isinstance(y_true, np.ndarray) or not isinstance(
                y_pred, np.ndarray
            ):
                raise RuntimeError(
                    "Arguments to Evaluator need to be either numpy ndarray or torch tensor!"
                )

            if not y_true.shape == y_pred.shape:
                raise RuntimeError("Shape of y_true and y_pred must be the same!")

            arrays_checked = True

        if not arrays_checked:
            # an empty metric list previously fell through to an
            # UnboundLocalError on the return statement
            raise ValueError("eval_metric must contain at least one metric")

        self.eval_metric = input_dict["eval_metric"]

        return y_true, y_pred

    def _compute_metrics(self, y_true, y_pred):
        """
        compute the performance metrics for the given true labels and prediction probabilities
        Parameters:
            -y_true: actual true labels
            -y_pred: predicted probabilities
        Returns:
            perf_dict: dictionary holding every requested metric; requesting
                "mse" or "rmse" stores both the "mse" and the "rmse" entry
        """
        perf_dict = {}
        for eval_metric in self.eval_metric:
            if eval_metric in ("mse", "rmse"):
                # results are accumulated instead of rebinding perf_dict, so
                # several metrics can be requested in one call
                if "mse" not in perf_dict:
                    mse = mean_squared_error(y_true, y_pred)
                    perf_dict["mse"] = mse
                    perf_dict["rmse"] = math.sqrt(mse)
            elif eval_metric == "ndcg":
                k = 10
                perf_dict["ndcg"] = ndcg_score(y_true, y_pred, k=k)
        return perf_dict

    def eval(self, input_dict, verbose=False):
        """
        evaluation for the node property prediction task
        Parameters:
            input_dict: dictionary containing "y_true", "y_pred" and "eval_metric"
            verbose: when True, print every requested metric
        Returns:
            perf_dict: dictionary mapping metric name to its value
        """
        y_true, y_pred = self._parse_and_check_input(input_dict)
        perf_dict = self._compute_metrics(y_true, y_pred)

        if verbose:
            print("INFO: Evaluation Results:")
            for eval_metric in input_dict["eval_metric"]:
                print(f"\t>>> {eval_metric}: {perf_dict[eval_metric]:.4f}")
        return perf_dict

    @property
    def expected_input_format(self):
        """Human-readable description of the input expected by `eval`."""
        desc = "==== Expected input format of Evaluator for {}\n".format(self.name)
        if "mse" in self.valid_metric_list:
            desc += "{'y_pred': y_pred}\n"
            desc += "- y_pred: numpy ndarray or torch tensor of shape (num_edges, ). Torch tensor on GPU is recommended for efficiency.\n"
            desc += "y_pred is the predicted weight for edges.\n"
        else:
            # eval_metric is only set once eval() has parsed an input
            raise ValueError(
                "Undefined eval metric %s" % (getattr(self, "eval_metric", None))
            )
        return desc

    @property
    def expected_output_format(self):
        """Human-readable description of the dictionary returned by `eval`."""
        desc = "==== Expected output format of Evaluator for {}\n".format(self.name)
        if "mse" in self.valid_metric_list:
            desc += "{'mse': mse}\n"
            desc += "- mse (float): mse score\n"
        else:
            # eval_metric is only set once eval() has parsed an input
            raise ValueError(
                "Undefined eval metric %s" % (getattr(self, "eval_metric", None))
            )
        return desc

`__init__(name)`

Parameters:

Name	Type	Description	Default
`name`	`str`	name of the dataset	required

Source code in tgb/nodeproppred/evaluate.py

def __init__(self, name: str):
    r"""
    Set up the evaluator for one dataset.

    Parameters:
        name: name of the dataset
    Raises:
        NotImplementedError: if `name` has no entry in DATA_EVAL_METRIC_DICT
    """
    self.name = name
    self.valid_metric_list = ["mse", "rmse", "ndcg"]
    dataset_supported = self.name in DATA_EVAL_METRIC_DICT
    if not dataset_supported:
        raise NotImplementedError("Dataset not supported")

`eval(input_dict, verbose=False)`

evaluation for the node property prediction task

Source code in tgb/nodeproppred/evaluate.py

def eval(self, input_dict, verbose=False):
    """
    Validate the input, compute the requested metrics and return them.

    Parameters:
        input_dict: dictionary handed to `_parse_and_check_input`; its
            "eval_metric" entry lists the metrics printed in verbose mode
        verbose: when True, print every requested metric
    Returns:
        the dictionary produced by `_compute_metrics`
    """
    targets, predictions = self._parse_and_check_input(input_dict)
    results = self._compute_metrics(targets, predictions)

    if not verbose:
        return results

    print("INFO: Evaluation Results:")
    for metric_name in input_dict["eval_metric"]:
        print(f"\t>>> {metric_name}: {results[metric_name]:.4f}")
    return results

`main()`

simple test for evaluator

Source code in tgb/nodeproppred/evaluate.py

def main():
    """
    simple test for evaluator

    Builds an Evaluator for "tgbn-trade", prints its expected input and
    output formats, and evaluates the "mse" metric on a small sample.
    """
    # local import: the sample inputs below are numpy arrays
    import numpy as np

    name = "tgbn-trade"
    evaluator = Evaluator(name=name)
    print(evaluator.expected_input_format)
    print(evaluator.expected_output_format)

    # y_true / y_pred were referenced without ever being defined, which
    # raised a NameError; small same-shaped sample arrays are used instead
    y_true = np.array([[0.1, 0.9], [0.7, 0.3]])
    y_pred = np.array([[0.2, 0.8], [0.6, 0.4]])
    input_dict = {"y_true": y_true, "y_pred": y_pred, "eval_metric": ["mse"]}

    result_dict = evaluator.eval(input_dict)
    print(result_dict)

tgb.nodeproppred

NodePropPredDataset

edge_feat: Optional[np.ndarray] property

eval_metric: str property

full_data: Dict[str, Any] property

node_feat: Optional[np.ndarray] property

num_classes: int property

test_mask: np.ndarray property

train_mask: np.ndarray property

val_mask: np.ndarray property

__init__(name, root='datasets', meta_dict=None, preprocess=True)

download()

find_next_labels_batch(cur_t)

generate_processed_files()

generate_splits(full_data, val_ratio=0.15, test_ratio=0.15)

pre_process()

reset_label_time()

return_label_ts()

PyGNodePropPredDataset

dst: torch.Tensor property

edge_feat: torch.Tensor property

edge_label: torch.Tensor property

eval_metric: str property

num_classes: int property

src: torch.Tensor property

test_mask: torch.Tensor property

train_mask: torch.Tensor property

ts: torch.Tensor property

val_mask: torch.Tensor property

get(idx)

get_TemporalData()

get_label_time()

get_node_label(cur_t)

len()

process_data()

reset_label_time()

Evaluator

__init__(name)

eval(input_dict, verbose=False)

main()

`tgb.nodeproppred`

`NodePropPredDataset`

`edge_feat: Optional[np.ndarray]` `property`

`eval_metric: str` `property`

`full_data: Dict[str, Any]` `property`

`node_feat: Optional[np.ndarray]` `property`

`num_classes: int` `property`

`test_mask: np.ndarray` `property`

`train_mask: np.ndarray` `property`

`val_mask: np.ndarray` `property`

`__init__(name, root='datasets', meta_dict=None, preprocess=True)`

`download()`

`find_next_labels_batch(cur_t)`

`generate_processed_files()`

`generate_splits(full_data, val_ratio=0.15, test_ratio=0.15)`

`pre_process()`

`reset_label_time()`

`return_label_ts()`

`PyGNodePropPredDataset`

`dst: torch.Tensor` `property`

`edge_feat: torch.Tensor` `property`

`edge_label: torch.Tensor` `property`

`eval_metric: str` `property`

`num_classes: int` `property`

`src: torch.Tensor` `property`

`test_mask: torch.Tensor` `property`

`train_mask: torch.Tensor` `property`

`ts: torch.Tensor` `property`

`val_mask: torch.Tensor` `property`

`get(idx)`

`get_TemporalData()`

`get_label_time()`

`get_node_label(cur_t)`

`len()`

`process_data()`

`reset_label_time()`

`Evaluator`

`__init__(name)`

`eval(input_dict, verbose=False)`

`main()`