| | |
"""
Gradio demo comparing token-level GraphCodeBERT embeddings of two code snippets with PCA.

Citation:
    Martinez-Gil, J. (2025). Augmenting the Interpretability of GraphCodeBERT for Code Similarity Tasks.
    International Journal of Software Engineering and Knowledge Engineering, 35(05), 657–678.
"""
| |
|
| | import numpy as np |
| | import matplotlib.pyplot as plt |
| | from sklearn.decomposition import PCA |
| | from transformers import RobertaTokenizer, RobertaModel |
| | import torch |
| | import gradio as gr |
| | from io import BytesIO |
| | from PIL import Image |
| |
|
| | |
# Load the GraphCodeBERT tokenizer and encoder once, at import time, so every
# request served by the UI reuses the same objects. "models/" is passed as the
# cache directory for the downloaded checkpoint files.
_CHECKPOINT = "microsoft/graphcodebert-base"
_CACHE_DIR = "models/"

tokenizer = RobertaTokenizer.from_pretrained(_CHECKPOINT, cache_dir=_CACHE_DIR)
model = RobertaModel.from_pretrained(_CHECKPOINT, cache_dir=_CACHE_DIR)
| |
|
| | |
# Default content of the first code editor: a runnable bubble-sort sample.
default_code_1 = """def bubble_sort(arr):
    n = len(arr)
    for i in range(n):
        for j in range(0, n-i-1):
            if arr[j] > arr[j+1]:
                arr[j], arr[j+1] = arr[j+1], arr[j]
    return arr"""
| |
|
# Default content of the second code editor: a runnable in-place quick-sort
# sample (Lomuto-style partition around the last element).
default_code_2 = """def quick_sort(arr, low, high):
    if low < high:
        pi = partition(arr, low, high)
        quick_sort(arr, low, pi - 1)
        quick_sort(arr, pi + 1, high)

def partition(arr, low, high):
    i = (low - 1)
    pivot = arr[high]
    for j in range(low, high):
        if arr[j] <= pivot:
            i += 1
            arr[i], arr[j] = arr[j], arr[i]
    arr[i+1], arr[high] = arr[high], arr[i+1]
    return (i + 1)"""
| |
|
| | |
def get_token_embeddings(code):
    """Encode one code snippet and return its per-token embeddings.

    The snippet is tokenized with the module-level ``tokenizer`` (inputs
    longer than 512 tokens are truncated) and run through the module-level
    ``model`` with gradient tracking disabled.

    Args:
        code: Source text to embed.

    Returns:
        A ``(token_embeddings, tokens)`` pair: the model's last hidden state
        with the batch dimension removed, converted to a NumPy array
        (presumably one row per token), and the token strings corresponding
        to the encoded input ids.
    """
    encoded = tokenizer(
        code,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # A single snippet is encoded, so drop the leading batch dimension.
    token_embeddings = hidden_states.squeeze(0).cpu().numpy()
    token_ids = encoded["input_ids"].squeeze()
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    return token_embeddings, tokens
| |
|
| | |
def compare_algorithms(code1, code2):
    """Plot the token embeddings of two code snippets in one 2-D PCA space.

    Both snippets are embedded with ``get_token_embeddings``. The two
    embedding matrices are stacked and projected together by a single
    2-component PCA, so the points of both snippets share the same axes.

    Args:
        code1: Source text of the first snippet (red points, "Code 1").
        code2: Source text of the second snippet (blue points, "Code 2").

    Returns:
        A ``PIL.Image.Image`` opened from the rendered PNG scatter plot.
    """
    # Build a standalone Figure instead of drawing through pyplot: pyplot
    # tracks a process-global "current figure", which concurrent UI requests
    # would share, and a pyplot figure stays registered (leaks) if saving
    # raises before it is closed. A bare Figure is simply garbage-collected.
    from matplotlib.figure import Figure

    emb1, _ = get_token_embeddings(code1)
    emb2, _ = get_token_embeddings(code2)

    # Fit one PCA on the stacked embeddings so both snippets are comparable.
    combined = np.concatenate([emb1, emb2], axis=0)
    coords = PCA(n_components=2).fit_transform(combined)

    # Rows [0, split) come from code1, the remainder from code2.
    split = len(emb1)

    fig = Figure(figsize=(6, 5), dpi=150)
    ax = fig.subplots()
    ax.scatter(coords[:split, 0], coords[:split, 1], color="red", label="Code 1", s=20)
    ax.scatter(coords[split:, 0], coords[split:, 1], color="blue", label="Code 2", s=20)
    ax.legend()
    # PCA axes carry no interpretable units, so hide ticks and grid.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.grid(False)

    # Render to an in-memory PNG and hand it back as a PIL image.
    buf = BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight")
    buf.seek(0)
    return Image.open(buf)
| |
|
# Gradio UI: two editable Python code boxes in, one PCA scatter image out.
interface = gr.Interface(
    fn=compare_algorithms,
    inputs=[
        gr.Code(language="python", value=default_code_1, label="Code 1"),
        gr.Code(language="python", value=default_code_2, label="Code 2"),
    ],
    outputs=gr.Image(type="pil", label="Token Embedding PCA"),
    title="GraphCodeBERT Token Embedding Comparison",
    description="Edit or paste two Python code snippets. This tool compares their token-level embeddings using GraphCodeBERT and PCA.",
    # Markdown shown with the interface; kept at column 0 so it is not
    # rendered as an indented code block.
    article="""
**Citation**
Martinez-Gil, J. (2025). *Augmenting the Interpretability of GraphCodeBERT for Code Similarity Tasks.* International Journal of Software Engineering and Knowledge Engineering, 35(05), 657–678.

**GitHub Repository**
[View Source on GitHub](https://github.com/jorge-martinez-gil/graphcodebert-interpretability)
""",
)


if __name__ == "__main__":
    # Launch the app only when this file is executed as a script.
    interface.launch()
| |
|
| |
|
| |
|
| |
|