diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/api/__init__.py b/api/__init__.py
new file mode 100644
index 0000000..542d6ce
--- /dev/null
+++ b/api/__init__.py
@@ -0,0 +1,14 @@
+"""
+The `process_mining.api` module provides tools for generating and manipulating event logs. These event logs can then be used for process mining.
+It uses the [PM4Py](https://github.com/process-intelligence-solutions/pm4py) library for process mining models and algorithms and `pandas` for data handling and manipulation.
+
+---
+
+## Overview
+
+- `csv_generator.py` : Allows fake event logs to be created to run process mining.
+- `io.py` : Provides the event log loader and file naming convention.
+- `process_discovery.py` : Contains functions to create process visualizations with PM4Py.
+
+"""
+
diff --git a/api/csv_generator.py b/api/csv_generator.py
new file mode 100644
index 0000000..207a941
--- /dev/null
+++ b/api/csv_generator.py
@@ -0,0 +1,70 @@
+import random
+import pandas as pd
+import uuid
+from datetime import datetime, timedelta
+
+
+# Function to generate fake data
+def generate_fake_event_log(num_issues=1, num_events_per_issue=7, output_csv="generated_csv.csv", seed=42):
+    """
+    Generates fake event log data and saves it to a CSV file. The event log always starts with the event 'assigned' and ends
+    with the event 'closed'. Events in between are randomly picked from {'labeled', 'milestoned', 'unlabeled', 'referenced', 'mentioned', 'subscribed'}.
+
+    Args:
+    - num_issues (int): Number of unique issues to generate.
+    - num_events_per_issue (int): Number of events per issue.
+    - output_csv (str): The path where the CSV will be saved.
+    - seed (int): Random seed for reproducible results.
+
+    Returns:
+    - str: The path to the saved CSV file.
+    """
+    # Set seed given as parameter
+    random.seed(seed)
+    # Set base time for all times to be based off of
+    base_time = datetime(2020, 1, 1, 12, 0, 0)
+
+    all_events = ['labeled', 'assigned', 'milestoned', 'closed', 'unlabeled', 'referenced', 'mentioned', 'subscribed']
+    event_data = []
+
+    for issue_num in range(1, num_issues + 1):
+        # Always start with 'assigned' and end with 'closed'
+        issue_events = ['assigned']
+
+        # Choose (num_events_per_issue - 2) random events from the rest, excluding 'assigned' and 'closed'
+        middle_candidates = [e for e in all_events if e not in ['assigned', 'closed']]
+        middle_events = random.choices(middle_candidates, k=num_events_per_issue - 2)
+        issue_events += middle_events
+        issue_events.append('closed')
+
+        for event_num, event in enumerate(issue_events):
+            unique_id = str(uuid.uuid4())
+            created_at = base_time + timedelta(minutes=event_num * 5 + issue_num * 60)
+            created_at_str = created_at.strftime('%Y-%m-%dT%H:%M:%SZ')
+            event_data.append([unique_id, created_at_str, event, issue_num])
+
+    event_log_df = pd.DataFrame(event_data, columns=['id', 'created_at', 'event', 'issue_number'])
+    event_log_df.to_csv(output_csv, index=False)
+    print(f"Fake event log data saved to {output_csv}")
+    return output_csv
+
+def modify_event(event_log_df, row_index, new_event):
+    """
+    Modify events in the given event log dataframe.
+
+    Args:
+    - event_log_df (pd.DataFrame): The event log data frame.
+    - row_index (int): The index of the row to modify.
+    - new_event (str): The new event value to assign.
+
+    Returns:
+    - pd.DataFrame: The modified event log dataframe.
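+
+    Example (illustrative; assumes a log produced by `generate_fake_event_log` has been loaded with pandas):
+        >>> df = pd.read_csv("generated_csv.csv")
+        >>> df = modify_event(df, row_index=4, new_event="mentioned")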
+ """ + # Ensure row_index is within the bounds of the dataframe + if row_index < 0 or row_index >= len(event_log_df): + raise IndexError(f"Row index {row_index} is out of bounds.") + + event_log_df.loc[row_index, 'event'] = new_event + return event_log_df + + diff --git a/api/io.py b/api/io.py new file mode 100644 index 0000000..00beb69 --- /dev/null +++ b/api/io.py @@ -0,0 +1,45 @@ +from datetime import datetime +import pandas +import pm4py + + +def generate_timestamped_filename(prefix, extension="png"): + """ + Generates a unique file name with a given prefix and file extension. + + Args: + - prefix (str): The prefix to be added to the file name. + - extension (str): The extension of the file (default is "png"). + + Returns: + - str: The generated file name with the current timestamp. + """ + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return f"{prefix}_{timestamp}.{extension}" + +def read_event_log(csv_path): + """ + Loads and formats an event log CSV for use with PM4Py. + + Args: + - csv_path (str): Path to the event log CSV file. + + Returns: + - pd.DataFrame: A formatted DataFrame ready for process mining. + + Raises: + FileNotFoundError: If the CSV file does not exist. + ValueError: If required columns are missing. + """ + event_log = pandas.read_csv(csv_path) + + if not {'issue_number', 'event', 'created_at'}.issubset(event_log.columns): + raise ValueError("CSV must contain 'issue_number', 'event', and 'created_at' columns.") + + event_log['issue_number'] = event_log['issue_number'].astype(str) + event_log['created_at'] = pandas.to_datetime(event_log['created_at'], errors='coerce') + + # PM4PY format + formatted_log = pm4py.format_dataframe(event_log, case_id='issue_number', activity_key='event', timestamp_key='created_at') + + return formatted_log \ No newline at end of file diff --git a/api/process_discovery.py b/api/process_discovery.py new file mode 100644 index 0000000..9a181e3 --- /dev/null +++ b/api/process_discovery.py @@ -0,0 +1,298 @@ +""" +# 📊 Process Discovery Module + +Module for visualizing process event logs using **PM4Py**. + +--- + +## Features + +This module provides functions to: + +- Extract **start** and **end** activities from a CSV log. +- Generate and save various visualizations: + - **Process tree** + - **Process graph (with filtering)** + - **Performance-based DFG** (Directed Flow Graph) + - **Occurrence-based DFG** (Directed Flow Graph) + - **Petri net** + +All functions assume the CSV argument has the following columns [`io.read_event_log`](io.html#read_event_log): +- 'issue_number': Unique case identifier. +- 'event': Activity name or label. +- 'created_at': Timestamp of the event. + +--- + +## Dependencies + +- `pandas` +- `pm4py` + +--- + +## Example + +```python +from process_discovery import start_end_activities, generate_tree_inductive + +start_acts, end_acts = start_end_activities("data/log.csv") + +generate_tree_inductive("data/log.csv") +``` +""" + +import os +from .io import generate_timestamped_filename, read_event_log +import pm4py + + +def start_end_activities(csv_path): + """ + Reads an event log from a CSV file and returns its start and end activities. + + Assumes the CSV has required columns, see here: [`io.read_event_log`](io.html#read_event_log) + + + Args: + - csv_path (str): Path to the event log CSV file. + + Returns: + tuple[dict, dict]: A tuple containing: + - dict: Start activities and their counts. + - dict: End activities and their counts. 
+
+    Raises:
+        FileNotFoundError: If the CSV path is invalid.
+        ValueError: If required columns are missing.
+    """
+    event_log = read_event_log(csv_path)
+
+    start_activities = pm4py.get_start_activities(event_log)
+    end_activities = pm4py.get_end_activities(event_log)
+    return start_activities, end_activities
+
+
+def generate_tree_inductive(csv_path, output_dir=None, action="view"):
+    """
+    Generates and saves a process tree visualization from an event log. Uses the Inductive Miner algorithm.
+
+    Assumes the CSV has required columns, see here: [`io.read_event_log`](io.html#read_event_log)
+
+    Args:
+    - csv_path (str): Path to the event log CSV file.
+    - output_dir (str, optional): Directory where the PNG image will be saved. Required if action is 'save' or 'both'.
+    - action (str): One of {'view', 'save', 'both'}.
+        - 'view': Display the graph in the current environment. (Default action)
+        - 'save': Save the graph to the specified output_dir.
+        - 'both': Display and save the graph.
+
+    Returns:
+        None
+
+    Side Effects:
+        Saves a PNG file of the process tree to the specified output directory if action is 'save' or 'both'.
+        Displays the visualization if action is 'view' or 'both'.
+
+    Raises:
+        FileNotFoundError: If the CSV path is invalid.
+        ValueError: If required columns are missing, or output_dir is not provided when action is 'save' or 'both'.
+    """
+    event_log = read_event_log(csv_path)
+
+    process_tree = pm4py.discover_process_tree_inductive(event_log)
+
+    if action == "view" or action == "both":
+        pm4py.view_process_tree(process_tree)
+
+    if action == "save" or action == "both":
+        if output_dir is not None:
+            if not os.path.exists(output_dir):
+                raise FileNotFoundError(f"The specified directory does not exist: {output_dir}")
+            file_name = generate_timestamped_filename("process_tree")
+            output_path = os.path.join(output_dir, file_name)
+            pm4py.save_vis_process_tree(process_tree, output_path)
+            print(f"Tree generated and saved to {output_path}")
+        else:
+            raise ValueError("Output directory must be specified for saving.")
+
+
+def generate_graph_inductive(csv_path, output_dir=None, action="view", noise_threshold=0.0):
+    """
+    Generates and saves a filtered process graph visualization from an event log,
+    using a noise threshold to remove less relevant activities. Uses the Inductive Miner algorithm.
+
+    Assumes the CSV has required columns, see here: [`io.read_event_log`](io.html#read_event_log)
+
+    Args:
+    - csv_path (str): Path to the event log CSV file.
+    - output_dir (str, optional): Directory where the PNG image will be saved. Required if action is 'save' or 'both'.
+    - noise_threshold (float): Threshold for filtering noise in the process graph (default is 0.0).
+    - action (str): One of {'view', 'save', 'both'}.
+        - 'view': Display the graph in the current environment. (Default action)
+        - 'save': Save the graph to the specified output_dir.
+        - 'both': Display and save the graph.
+
+    Returns:
+        None
+
+    Side Effects:
+        Saves a PNG file of the filtered process graph to the specified output directory if action is 'save' or 'both'.
+        Displays the visualization if action is 'view' or 'both'.
+
+    Raises:
+        FileNotFoundError: If the CSV path is invalid.
+        ValueError: If required columns are missing, output_dir is not provided when needed, or noise_threshold is invalid.
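+
+    Example (illustrative; the CSV path and output directory are hypothetical):
+        >>> generate_graph_inductive("issue_output.csv", noise_threshold=0.8)
+        >>> generate_graph_inductive("issue_output.csv", output_dir="plots", action="save")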
+ """ + if not (0.0 <= noise_threshold <= 1.0): + raise ValueError("Noise threshold must be between 0.0 and 1.0.") + + event_log = read_event_log(csv_path) + + bpmn_filtered = pm4py.discover_bpmn_inductive(event_log, noise_threshold) + + if action == "view" or action == "both": + pm4py.view_bpmn(bpmn_filtered) + + if action == "save" or action == "both": + if output_dir is not None: + if not os.path.exists(output_dir): + raise FileNotFoundError(f"The specified directory does not exist: {output_dir}") + file_name = generate_timestamped_filename("process_graph_inductive") + output_path = os.path.join(output_dir, file_name) + pm4py.save_vis_bpmn(bpmn_filtered, output_path, format="png") + print(f"Graph generated and saved to {output_path}") + else: + raise ValueError("Output directory must be specified for saving.") + + +def generate_performance_graph_dfg(csv_path, output_dir=None, action="view"): + """ + Generates and saves a performance-related Directed Flow Graph (DFG) visualization. + + Assumes the CSV has required columns, see here: [`io.read_event_log`](io.html#read_event_log) + + Args: + - csv_path (str): Path to the event log CSV file. + - output_dir (str, optional): Directory where the PNG image will be saved. Required if action is 'save' or 'both'. + - action (str): One of {'view', 'save', 'both'}. + - 'view': Display the graph in the current environment. (Default action) + - 'save': Save the graph to the specified output_dir. + - 'both': Display and save the graph. + + Returns: + None + + Side Effects: + Saves a PNG file of the performance DFG to the specified output directory if action is 'save' or 'both'. + Displays the visualization if action is 'view' or 'both'. + + Raises: + FileNotFoundError: If the CSV path is invalid. + ValueError: If required columns are missing, or output_dir is not provided when action is 'save' or 'both'. + """ + event_log = read_event_log(csv_path) + + performance_dfg, start_activities, end_activities = pm4py.discover_performance_dfg(event_log, case_id_key='issue_number', activity_key='event', timestamp_key='created_at') + + if action == "view" or action == "both": + pm4py.view_performance_dfg(performance_dfg, start_activities, end_activities) + if action == "save" or action == "both": + if output_dir is not None: + if not os.path.exists(output_dir): + raise FileNotFoundError(f"The specified directory does not exist: {output_dir}") + file_name = generate_timestamped_filename("performance_dfg") + output_path = os.path.join(output_dir, file_name) + pm4py.save_vis_performance_dfg(performance_dfg, start_activities, end_activities) + print(f"Graph generated and saved to {output_path}") + else: + raise ValueError("Output directory must be specified for saving.") + + +def generate_count_graph_dfg(csv_path, output_dir=None, action="view"): + """ + Generates and saves an occurrence-based Directed Flow Graph (DFG) visualization, + which shows the frequency of activity transitions. + + Assumes the CSV has required columns, see here: [`io.read_event_log`](io.html#read_event_log) + + Args: + - csv_path (str): Path to the event log CSV file. + - output_dir (str): Directory where the PNG image will be saved. + - action (str): One of {'view', 'save', 'both}. + - 'view': Display the graph in the current enviroment. (Default action) + - 'save': Save the graph to the specified output_dir. + - 'both': Display and save the graph. + + Returns: + None + + Side Effects: + Saves a PNG file of the occurrence DFG to the specified output directory if action is 'save' or 'both'. 
+        Displays the visualization if action is 'view' or 'both'.
+
+    Raises:
+        FileNotFoundError: If the CSV path is invalid.
+        ValueError: If required columns are missing, or output_dir is not provided when action is 'save' or 'both'.
+    """
+    event_log = read_event_log(csv_path)
+
+    # Run DFG discovery and generate the graph.
+    dfg, start_activities, end_activities = pm4py.discover_dfg(event_log, case_id_key='issue_number', activity_key='event', timestamp_key='created_at')
+
+    if action == "view" or action == "both":
+        pm4py.view_dfg(dfg, start_activities, end_activities)
+    if action == "save" or action == "both":
+        if output_dir is not None:
+            if not os.path.exists(output_dir):
+                raise FileNotFoundError(f"The specified directory does not exist: {output_dir}")
+
+            file_name = generate_timestamped_filename("occurrence_dfg")
+            output_path = os.path.join(output_dir, file_name)
+            pm4py.save_vis_dfg(dfg, start_activities, end_activities, output_path)
+            print(f"Graph generated and saved to {output_path}")
+        else:
+            raise ValueError("Output directory must be specified for saving.")
+
+
+def generate_petri_net_inductive(csv_path, output_dir=None, action="view"):
+    """
+    Generates and saves a Petri net visualization from an event log using the Inductive Miner.
+
+    Assumes the CSV has required columns, see here: [`io.read_event_log`](io.html#read_event_log)
+
+    Args:
+    - csv_path (str): Path to the event log CSV file.
+    - output_dir (str, optional): Directory where the PNG image will be saved. Required if action is 'save' or 'both'.
+    - action (str): One of {'view', 'save', 'both'}.
+        - 'view': Display the graph in the current environment. (Default action)
+        - 'save': Save the graph to the specified output_dir.
+        - 'both': Display and save the graph.
+
+    Returns:
+        None
+
+    Side Effects:
+        Saves a PNG file of the Petri net to the specified output directory if action is 'save' or 'both'. Displays the visualization if action is 'view' or 'both'.
+
+    Raises:
+        FileNotFoundError: If the CSV path is invalid.
+        ValueError: If required columns are missing, or output_dir is not provided when action is 'save' or 'both'.
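+
+    Example (illustrative; the CSV path and output directory are hypothetical):
+        >>> generate_petri_net_inductive("issue_output.csv")
+        >>> generate_petri_net_inductive("issue_output.csv", output_dir="plots", action="both")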
+ """ + event_log = read_event_log(csv_path) + + net, im, fm = pm4py.discover_petri_net_inductive(event_log, case_id_key='issue_number', activity_key='event', timestamp_key='created_at') + + if action == "view" or action == "both": + pm4py.view_petri_net(net, im, fm) + if action == "save" or action == "both": + if output_dir is not None: + if not os.path.exists(output_dir): + raise FileNotFoundError(f"The specified directory does not exist: {output_dir}") + + file_name = generate_timestamped_filename("petri_net") + output_path = os.path.join(output_dir, file_name) + pm4py.save_vis_petri_net(net, im, fm, output_path, format="png") + print(f"Graph generated and saved to {output_path}") + else: + raise ValueError("Output directory must be specified for saving.") \ No newline at end of file diff --git a/env.yml b/env.yml new file mode 100644 index 0000000..9391834 --- /dev/null +++ b/env.yml @@ -0,0 +1,12 @@ +name: process_mining +channels: + - conda-forge + - defaults +dependencies: + - python=3.10.16 + - pandas + - pip + - ipykernel + - pip: + - pm4py +prefix: /opt/anaconda3/envs/process_mining \ No newline at end of file diff --git a/notebooks/issue_event_processing.ipynb b/notebooks/issue_event_processing.ipynb new file mode 100644 index 0000000..b1a3e9f --- /dev/null +++ b/notebooks/issue_event_processing.ipynb @@ -0,0 +1,389 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "Event logs are the foundation of process mining. They capture records of activities within a system, providing information about when actions occur and what those actions are. For example, in GitHub Issue Events, actions such as assigning users, labeling issues, and closing issues are recorded. Together, these events tell the full story of the process from start to finish. Event logs can be transformed into differnt process graphs, which visually represent the flow of activities and how they connect. These graphs make it easier to identify inefficiencies, bottlenecks, and deviations from expected workflows. They provide valuable insights for process improvement and optimization. \n", + "\n", + "In this notebook, we demonstrate how to create process graphs using GitHub Issue Event Logs. Specifically, we use [Kaiaulu R package](https://github.com/sailuh/kaiaulu) to download and parse GitHub Issue Events. Capturing the lifecycle of an issue from creation, through assignment, discussion, and to closure.\n", + "Although these example's will focus on GitHub data, the techniques shown here can be applied to any event log that follows a similar format. This notebook serves as a demonstration of how event data can be prepared and explored for process mining. \n", + "\n", + "Note: For more information on process mining it is reccommeneded you refer to this [video](https://www.youtube.com/watch?v=XLHtvt36g6U&t=1181s). Sailuh/process mining relies heavily on the python package [pm4py](https://github.com/process-intelligence-solutions/pm4py), refer to the documentation for more detailed information regarding algorithms and models. 
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirements\n", + "\n", + "- Gihub Access Token \n", + "- [Kaiaulu](https://github.com/sailuh/kaiaulu) R package to download data via CLI\n", + "- Python environment with [pm4py](https://github.com/process-intelligence-solutions/pm4py) installed\n", + "- Faker installed for CSV data generation\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Python Imports\n", + "\n", + "import sys\n", + "import os\n", + "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))\n", + "import pandas as pd\n", + "import subprocess\n", + "\n", + "from api.csv_generator import *\n", + "from api.process_discovery import *\n", + "from api.io import *\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Overview\n", + "\n", + "To begin process mining, we first need the data in a properly formatted .csv file. This is where [Kaiaulu](https://github.com/sailuh/kaiaulu) comes in. Kaiaulu is a R package for software repository mining that provides a set of functions organized as an API. These functions allow us to downlaod and parse GitHub issue event data in our case. **In our examples we'll download and analysis issue event data from the Kaiaulu GitHub repository itself**. Rather than calling these functions from the project we will use an executable command-line interface (CLI) that is provided by kaiaulu to keep the processes indepenedent. Because this notebook and Kaiaulu are self contained, we assume the following folder organization: \n", + "\n", + "- ```kaiaulu/kaiaulu/exec/ghevents.R```\n", + "- ```kaiaulu/rawdata/```\n", + "- ```process_mining/notebooks/issue_event_processing.ipynb```\n", + "\n", + "Note: In this structure, the outer kaiaulu/ folder represents the project scope, while the inner kaiaulu/ contains the actural R package with functions. It is assumed the process_mining/ folder and the outer kaiaulu/ folder are together in the same folder.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### GitHub Events\n", + "\n", + "The executable CLI requires a project configuration file to define the scope of the data we want to collect. Because we want to download Kaiaulu GitHub data we will be using the correponding **kaiaulu.yml** config file. \n", + "We are interested in the three fields from the config: \n", + "- ```repo``` : the name of the GitHub repo\n", + "- ```owner``` : the GitHub user/organization\n", + "- ```issue_event``` : the save path for the raw issue event data (also default process graph save folder)\n", + "\n", + "By default the issue_event path points to a directory called rawdata, located outside both the process_mining and Kaiaulu folders. The path structure is project dependent, hence the relative path is : **../rawdata/kaiaulu/sailuh_kaiaulu/issue_event/**\n", + "\n", + "Finally, the CLI requires a [GitHub API token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) to increase the request rate limite during data downloads. 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cwd = os.getcwd()\n",
+    "\n",
+    "kaiaulu_path = os.path.abspath(os.path.join(\"..\", \"..\", \"kaiaulu\", \"kaiaulu\"))\n",
+    "token_path = \"~/.ssh/github_token\"\n",
+    "\n",
+    "os.chdir(kaiaulu_path)\n",
+    "# To download, use the download command and specify the project config file and token path.\n",
+    "command = f\"Rscript exec/ghevents.R download conf/kaiaulu.yml --token_path={token_path}\"\n",
+    "subprocess.run(command, shell=True, check=True)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once we have the GitHub issue event rawdata (JSON) we may now parse it into an event log in .csv format. By default the .csv is saved to the folder where this package was downloaded. For example, if it was downloaded to /Desktop/process_mining/... the .csv will have the following path: /Desktop/issue_output.csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_dir = os.path.abspath(os.path.join(cwd, \"..\",\"..\"))\n",
+    "\n",
+    "\n",
+    "# To parse, use the parse command and specify the config file and output CSV path.\n",
+    "command = f\"Rscript exec/ghevents.R parse conf/kaiaulu.yml {output_dir}/issue_output.csv\"\n",
+    "subprocess.run(command, shell=True, check=True)\n",
+    "\n",
+    "os.chdir(cwd)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that we have a parsed event log in CSV format, we can load it using pandas and preview the first five rows to get a sense of the data structure."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "csv_path = os.path.join(output_dir, \"issue_output.csv\")\n",
+    "\n",
+    "df_issues = pd.read_csv(csv_path)\n",
+    "print(df_issues.head(5))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As we can see, the parsed event log contains all the data needed to describe the entire process. However, reading each event individually does not give us a good picture of the process as a whole. Process mining solves this by stitching the events together into a visual, providing an overview of how processes actually unfold. \n",
+    "\n",
+    "It is important to note that for process mining we are only interested in the ```created_at```, ```event```, and ```issue_number``` columns. The others are not needed to generate graphs. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Process Mining GitHub Events\n",
+    "\n",
+    "Now we will generate the visualizations in practice using the data we downloaded and parsed from the Kaiaulu GitHub issues. \n",
+    "\n",
+    "The process tree is the first visual we will generate. It uses the Inductive Miner algorithm to generate a hierarchical visualization of the workflow, capturing the sequences and dependencies of events. The algorithm is based on process discovery and highlights decision points and parallel activities. It is important for understanding the overall flow and structure of events and for identifying transitions and potential inefficiencies in the process. \n",
+    "\n",
+    "Note: You specify action as **view**, **save**, or **both** to determine what happens to the visualization. By default the generate functions will only view the graph. For our example we specify output_dir as the folder where process_mining was downloaded. This means for actions **save** and **both**, the images will be saved to the folder where process_mining was downloaded unless otherwise specified. This is the same location where the CSV was parsed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "generate_tree_inductive(csv_path, output_dir, action=\"both\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The visual is generated in the notebook and we can find the file located outside of process_mining.\n",
+    "\n",
+    "We will create a BPMN (Business Process Model and Notation) graph with the same Inductive Miner used to create the tree in the last part.\n",
+    "The BPMN model is a standard graphical representation of the process and is the simplest graph we will generate. It highlights the sequences of activities, decision points, and possible parallel paths in the process model. Compared to the tree, its simpler form makes it easier to identify inefficiencies and deviations in the process flow. \n",
+    "\n",
+    "Filtering can also be applied to the process. By applying filtering we refine the model to focus on the most significant parts of the process. It does this by reducing the noise and making the key patterns more apparent. This noise could be, for example, events that are not required to complete the process or that stray away from the main process. You can set the ```noise_threshold``` as a parameter; note the default is 0.0. We will set a 0.8 ```noise_threshold``` to reduce the graph size.\n",
+    "\n",
+    "From here on we will only view the visuals to avoid creating unnecessary files, but as stated above this parameter can be changed to **save** or **both**. You may need to expand the visual to see the events."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "generate_graph_inductive(csv_path, noise_threshold=0.8)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, the time edge weight graph visualizes the process flow with a focus on the time spent between activities. It uses the performance DFG (Directly-Follows Graph). Every edge has a value representing the interval between consecutive events. These time-based relations are useful for showing where the process may need to be optimized to reduce the overall cycle time of the process. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "generate_performance_graph_dfg(csv_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we are starting to get some complex graphs. This is because the overall process we are looking at (Kaiaulu GitHub Issue Events) is very large: it contains 300+ GitHub issues and a total of 3600+ events at the time this notebook was created. The end of this notebook provides graphs with minimal processes to make the graphs easier to understand.\n",
+    "\n",
+    "Another DFG (Directly-Follows Graph) generates the graph with occurrence edges. The weight is based on the frequency of transitions between nodes. This model highlights the most common paths, allowing for a better understanding of the dominant workflow patterns. It can help identify redundant steps in the process. "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generate_count_graph_dfg(csv_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, the Petri Net is a commonly used formal model used to represent workflows in automation and process mining. Unlike simplier models such as BPMN, Petri Nets represent conditions, tokens, and transitions. Instead of just showing the overview of the process. \n", + "\n", + "Important Symbols: \n", + "\n", + "- Circle with black center dot: Marks the start of the process.\n", + "- Circle with black square: Marks the end of the process.\n", + "- Empty circles: State or conditions in the process.\n", + "- Black boxes: Transitions that may be considered special. Example: Silent steps that don't correspond to events but may be needed for logical execution of the process." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generate_petri_net_inductive(csv_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A useful function to call in conjunction when generating graphs is **start_end_activities(csv_path)**. It finds the start and end activites for the process. \n", + "The start and end activies are the first and last recorded events in the process workflow. Marking the entry and exit points where no prior or further events occur. These activies are useful for reasoning through process grpahs and finding inefficencies in a non-graphical manner." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "start_end_activities(csv_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Understanding Process Graphs\n", + "\n", + "Now that we have seen all the process mining functionally we can scale the process down to better understand what is happening. There will be three \"experiements\" we run.\n", + "\n", + "For these we will need small event logs. These are artificially generated with functiuons from ```api/csv_generator.py```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_csv_path = f\"{output_dir}/generated_csv.csv\"\n", + "\n", + "generate_fake_event_log(num_issues=1, num_events_per_issue=7, output_csv=generated_csv_path, seed=42)\n", + "\n", + "event_log_df = pd.read_csv(generated_csv_path)\n", + "print(event_log_df.head(7))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The event log is very straight forward with each event appearing in order of ```created_at```. We will first generate a BPMN graph with no filtering to get a baseline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generate_graph_inductive(generated_csv_path, noise_threshold=0.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will re-run the function to generate the graph. Notice that the resulting graph remains unchanged, demonstrating that the Inductive Miner algorithm produces consistent and reproducible results when applied to the same event log." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generate_graph_inductive(generated_csv_path, noise_threshold=0.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we will change one event from the event log demonstrating the change to the process graph created. We are changing the event on row 4 to be mentioned instead of milestoned. This will shift the cycle in the process to the right changing it to \"mentioned\". \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "modify_event(event_log_df, row_index=4, new_event=\"mentioned\")\n", + "print(event_log_df.head(7))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "event_log_df.to_csv(f\"{output_dir}/modified_csv.csv\")\n", + "\n", + "generate_graph_inductive(f\"{output_dir}/modified_csv.csv\", noise_threshold=0.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Conclusion\n", + "\n", + "Analyzing long and complex event logs can quickly become overwhelming, especially when trying to understand the underlying process structure. To manage this complexity, it is recommended to start with a smaller subset of data and gradually build up. A practical approach is to begin with only a few issues and examine how they translate into the process graph. This allows for clearer insights and easier debugging as the process grows.\n", + "\n", + "This can be done by reading the csv into a pandas DataFrame and manipulating the data with Python. Alternatively the CSV can be modified with Excel or Google Sheets. The process graph generation functions remain flexible by requiring a CSV input, allowing them to be called from outside Python and integrated with other tools or systems.\n", + "\n", + "By incrementally expanding the event log and re-generating the visualization, users can develop a deeper, more manageable understanding of the overall process behavior." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "process_mining", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}