Empty file added __init__.py
Empty file.
14 changes: 14 additions & 0 deletions api/__init__.py
@@ -0,0 +1,14 @@
"""
The `process_mining.api` module provides tools for generating and manipulating event logs, which can then be used for process mining.
It uses the [PM4Py](https://github.com/process-intelligence-solutions/pm4py) library for process mining models and algorithms, and `pandas` for data handling and manipulation.

---

## Overview

- `csv_generator.py` : Generates fake event logs for running process mining.
- `io.py` : Provides an event log loader and a timestamped file naming convention.
- `process_visual_generation.py` : Contains functions to create process visualizations with PM4Py.
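
## Example

A minimal, hedged usage sketch of the modules listed above (the import paths assume the package is importable as `process_mining.api`; the file name is illustrative):

```python
from process_mining.api.csv_generator import generate_fake_event_log
from process_mining.api.io import read_event_log

# Generate a small fake event log, then load it in PM4Py format.
csv_path = generate_fake_event_log(num_issues=3, num_events_per_issue=5,
                                   output_csv="demo_log.csv", seed=1)
formatted_log = read_event_log(csv_path)
print(formatted_log.head())
```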

"""

70 changes: 70 additions & 0 deletions api/csv_generator.py
@@ -0,0 +1,70 @@
import random
import pandas as pd
import uuid
from datetime import datetime, timedelta


# Function to generate fake data
def generate_fake_event_log(num_issues=1, num_events_per_issue=7, output_csv="generated_csv.csv", seed=42):
"""
    Generates fake event log data and saves it to a CSV file. Each issue's trace always starts with the event 'assigned' and ends
    with the event 'closed'. The events in between are picked at random from {'labeled', 'milestoned', 'unlabeled', 'referenced', 'mentioned', 'subscribed'}.

    Args:
    - num_issues (int): Number of unique issues to generate.
    - num_events_per_issue (int): Number of events per issue.
    - output_csv (str): The path where the CSV will be saved.
    - seed (int): Random seed for reproducible output.

    Returns:
    - str: The path to the saved CSV file.
"""
# Set seed given as parameter
random.seed(seed)
    # Base timestamp that all generated event times are offset from
base_time = datetime(2020, 1, 1, 12, 0, 0)

all_events = ['labeled', 'assigned', 'milestoned', 'closed', 'unlabeled', 'referenced', 'mentioned', 'subscribed']
event_data = []

for issue_num in range(1, num_issues + 1):
# Always start with 'assigned' and end with 'closed'
issue_events = ['assigned']

# Choose (num_events_per_issue - 2) random events from the rest, excluding 'assigned' and 'closed'
middle_candidates = [e for e in all_events if e not in ['assigned', 'closed']]
middle_events = random.choices(middle_candidates, k=num_events_per_issue - 2)
issue_events += middle_events
issue_events.append('closed')

for event_num, event in enumerate(issue_events):
unique_id = str(uuid.uuid4())
created_at = base_time + timedelta(minutes=event_num * 5 + issue_num * 60)
created_at_str = created_at.strftime('%Y-%m-%dT%H:%M:%SZ')
event_data.append([unique_id, created_at_str, event, issue_num])

event_log_df = pd.DataFrame(event_data, columns=['id', 'created_at', 'event', 'issue_number'])
event_log_df.to_csv(output_csv, index=False)
print(f"Fake event log data saved to {output_csv}")
return output_csv
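
A brief usage sketch for the generator (hedged: the import path `process_mining.api.csv_generator` and the output file name are assumptions):

```python
import pandas as pd
from process_mining.api.csv_generator import generate_fake_event_log

# Create a tiny log: 2 issues with 4 events each.
path = generate_fake_event_log(num_issues=2, num_events_per_issue=4,
                               output_csv="tiny_log.csv", seed=0)
df = pd.read_csv(path)
# Each issue's trace starts with 'assigned' and ends with 'closed'.
print(df.groupby('issue_number')['event'].apply(list))
```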

def modify_event(event_log_df, row_index, new_event):
"""
    Modify the event value of a single row in the given event log dataframe (the change is made in place).

Args:
- event_log_df (pd.DataFrame): The event log data frame.
- row_index (int): The index of the row to modify.
- new_event (str): The new event value to assign.

Returns:
- pd.DataFrame: The modified event log dataframe.
"""
# Ensure row_index is within the bounds of the dataframe
if row_index < 0 or row_index >= len(event_log_df):
raise IndexError(f"Row index {row_index} is out of bounds.")

event_log_df.loc[row_index, 'event'] = new_event
return event_log_df
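
A short sketch showing how `modify_event` could be used to inject a deviation into a generated trace (hedged: import path assumed as above):

```python
import pandas as pd
from process_mining.api.csv_generator import generate_fake_event_log, modify_event

path = generate_fake_event_log(num_issues=1, output_csv="one_issue.csv")
log_df = pd.read_csv(path)
# Overwrite the second event of the trace; the DataFrame is modified in place
# and also returned.
log_df = modify_event(log_df, row_index=1, new_event='mentioned')
print(log_df.loc[1, 'event'])  # 'mentioned'
```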


45 changes: 45 additions & 0 deletions api/io.py
@@ -0,0 +1,45 @@
from datetime import datetime
import pandas
import pm4py


def generate_timestamped_filename(prefix, extension="png"):
"""
Generates a unique file name with a given prefix and file extension.

Args:
- prefix (str): The prefix to be added to the file name.
- extension (str): The extension of the file (default is "png").

Returns:
- str: The generated file name with the current timestamp.
"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
return f"{prefix}_{timestamp}.{extension}"
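
For example (hedged: the exact timestamp depends on when this runs):

```python
name = generate_timestamped_filename("dfg", extension="png")
# e.g. 'dfg_20240101_120000.png'
```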

def read_event_log(csv_path):
"""
Loads and formats an event log CSV for use with PM4Py.

Args:
- csv_path (str): Path to the event log CSV file.

Returns:
- pd.DataFrame: A formatted DataFrame ready for process mining.

    Raises:
    - FileNotFoundError: If the CSV file does not exist.
    - ValueError: If required columns are missing.
"""
event_log = pandas.read_csv(csv_path)

if not {'issue_number', 'event', 'created_at'}.issubset(event_log.columns):
raise ValueError("CSV must contain 'issue_number', 'event', and 'created_at' columns.")

event_log['issue_number'] = event_log['issue_number'].astype(str)
event_log['created_at'] = pandas.to_datetime(event_log['created_at'], errors='coerce')

    # Convert to the PM4Py standard event log format
formatted_log = pm4py.format_dataframe(event_log, case_id='issue_number', activity_key='event', timestamp_key='created_at')

return formatted_log
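
An end-to-end sketch combining the loader with PM4Py discovery (hedged: assumes a CSV produced by `csv_generator.generate_fake_event_log` with its default file name, and uses PM4Py's simplified `discover_dfg` interface):

```python
import pm4py
from process_mining.api.io import read_event_log

log = read_event_log("generated_csv.csv")
dfg, start_activities, end_activities = pm4py.discover_dfg(log)
# Traces produced by csv_generator always start with 'assigned'.
print(start_activities)
```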