Analyzing Program Similarities and Differences Across Multiple Languages

FLAGreserachlaborato 132 views 43 slides Sep 10, 2024
Slide 1
Slide 1 of 103
Slide 1
1
Slide 2
2
Slide 3
3
Slide 4
4
Slide 5
5
Slide 6
6
Slide 7
7
Slide 8
8
Slide 9
9
Slide 10
10
Slide 11
11
Slide 12
12
Slide 13
13
Slide 14
14
Slide 15
15
Slide 16
16
Slide 17
17
Slide 18
18
Slide 19
19
Slide 20
20
Slide 21
21
Slide 22
22
Slide 23
23
Slide 24
24
Slide 25
25
Slide 26
26
Slide 27
27
Slide 28
28
Slide 29
29
Slide 30
30
Slide 31
31
Slide 32
32
Slide 33
33
Slide 34
34
Slide 35
35
Slide 36
36
Slide 37
37
Slide 38
38
Slide 39
39
Slide 40
40
Slide 41
41
Slide 42
42
Slide 43
43
Slide 44
44
Slide 45
45
Slide 46
46
Slide 47
47
Slide 48
48
Slide 49
49
Slide 50
50
Slide 51
51
Slide 52
52
Slide 53
53
Slide 54
54
Slide 55
55
Slide 56
56
Slide 57
57
Slide 58
58
Slide 59
59
Slide 60
60
Slide 61
61
Slide 62
62
Slide 63
63
Slide 64
64
Slide 65
65
Slide 66
66
Slide 67
67
Slide 68
68
Slide 69
69
Slide 70
70
Slide 71
71
Slide 72
72
Slide 73
73
Slide 74
74
Slide 75
75
Slide 76
76
Slide 77
77
Slide 78
78
Slide 79
79
Slide 80
80
Slide 81
81
Slide 82
82
Slide 83
83
Slide 84
84
Slide 85
85
Slide 86
86
Slide 87
87
Slide 88
88
Slide 89
89
Slide 90
90
Slide 91
91
Slide 92
92
Slide 93
93
Slide 94
94
Slide 95
95
Slide 96
96
Slide 97
97
Slide 98
98
Slide 99
99
Slide 100
100
Slide 101
101
Slide 102
102
Slide 103
103

About This Presentation

Slides for the keynote presentation given by @ncardoz at the
SQAMIA24 Workshop on Software quality, Analysis, Monitoring, Improvement, and Applications 2024 in Novi Sad


Slide Content

Analyzing Program Similarities
and Differences Across Multiple
Languages
Nicolás Cardozo
Universidad de los Andes, Bogotá - Colombia
[email protected]
@ncardoz
SQAMIA - 9/9/2024
Novi Sad, Serbia

2
int foo(int[] a) {
int n = a.size;
int j = n;
bool s = true;
while (s) {
s = false;
for (int i = 1; i < j; i++) {
if (a[i] < a[i - 1]) {
int t = a[i];
a[i] = a[i - 1];
a[i - 1] = t;
s = true;
}
}
j--;
}
return 0;
}
void foo(int arr[], int l, int m, int r){
int k;
int n1 = m - l + 1;
int n2 = r - m;
int L[n1], R[n2];
for(int i = 0; i < n1; i++)
L[i] = arr[l + i];
for(int j = 0; j < n2; j++)
R[j] = arr[m + 1+ j];
int i = 0;
int j = 0;
k = l;
while (i < n1 && j < n2) {
if (L[i] <= R[j]) {
arr[k] = L[i];
i++;
}
else {
arr[k] = R[j];
j++;
}
k++;
}
while (i < n1) {
arr[k] = L[i];
i++;
k++;
}
while (j < n2) {
arr[k] = R[j];
j++;
k++;
}
}
void bar(int arr[], int l, int r) {
if (l < r) {
int m = l + (r - l) / 2;
bar(arr, l, m);
bar(arr, m + 1, r);
foo(arr, l, m, r);
}
}
?

3
int foo(int[] a) {
int n = a.size;
int j = n;
bool s = true;
while (s) {
s = false;
for (int i = 1; i < j; i++) {
if (a[i] < a[i - 1]) {
int t = a[i];
a[i] = a[i - 1];
a[i - 1] = t;
s = true;
}
}
j--;
}
return 0;
}
foo(List<int> array) {
int lengthOfArray = array.length;
for (int i=0; i<lengthOfArray - 1; i++) {
for (int j=0; j<lengthOfArray-i-1; j++) {
if (array[j] > array[j + 1]) {
// Swapping using temporary variable
int temp = array[j];
array[j] = array[j + 1];
array[j + 1] = temp;
}
}
}
return (array);
}
?

4
(define (foo list)
(cond ((null? (cdr list)) list)
((> (car list) (cadr list))
(cons (cadr list) (foo (cons (car list)
(cdr (cdr list))))))
(else (cons (car list) (foo (cdr list))))
)
)
int foo(int[] a) {
int n = a.size;
int j = n;
bool s = true;
while (s) {
s = false;
for (int i = 1; i < j; i++) {
if (a[i] < a[i - 1]) {
int t = a[i];
a[i] = a[i - 1];
a[i - 1] = t;
s = true;
}
}
j--;
}
return 0;
}
?

5
int foo(int[] a) {
int n = a.size;
ans = 0;
for(int i = 0; i < n; i++) {
if(a[i] % 2 != 0)
ans += a[i];
}
return 0;
}
void foo(int *a) {
int n = a.size;
for(size_t i = 1; i < n; ++i) {
int tmp = a[i];
size_t j = i;
while(j > 0 && tmp < a[j - 1]) {
a[j] = a[j - 1];
—j;
}
a[j] = tmp;
}
}
?

6

7

8
What constitutes the similarities or
differences between two programs?

8
What constitutes the similarities or
differences between two programs?
Behavior
Syntax
Semantics
Intention of
abstractions

9
The road ahead
How to detect
similar code
How to measure
similarities/differences
How to track and
manage divergence
Created by PixelX
from Noun Project
How to manage all of this across multiple
languages, using different paradigms

Similarities detection

11
Code clones
Python code snippets
i = 1 i = 1

11
Code clones
Type 1 clones
Python code snippets
i = 1 i = 1

11
Code clones
Type 1 clones
Type 2 clones
Python code snippets
i = 1
j = 2
i = 1
l = 2

11
Code clones
Type 1 clones
Type 2 clones
Type 3 clones
Python code snippets
i = 1
j = 2
i = i + 1
i = 1
l = 2
i += 1

11
Code clones
Type 1 clones
Type 2 clones
Type 3 clones
Type 4 clones
Python code snippets
i = 1
j = 2
i = i + 1
for i in range(1,10):
i += 1
i = 1
l = 2
i += 1
i = 1 + 9

11
Code clones
Type 1 clones
Type 2 clones
Type 3 clones
Type 4 clones
Python code snippets
i = 1
j = 2
i = i + 1
for i in range(1,10):
i += 1
i = 1
l = 2
i += 1
i = 1 + 9
}
Focus of clone
detection

12
How to detect clones
Token or lexicalTree Graph HybridTextual
Textual
OUT OF STEP is a hybrid approach combining a
generalized tree structure and the textual
representation of tokens

13
Type 1 clones
Type 2 clones
Type 3 clones
Type 4 clones
Kotlin
var i = 1
val j = 2
i = i + 1
for(i in 1..10) {
i += 1
}
int i = 1;
int l = 2;
i += 1;
i = 1 + 9;
}
Focus of clone
detection
Dart
OUT OF STEP

14
Clone analysis process
Grammar definition

14
Clone analysis process
Grammar definition
ANTLR 4
generation

14
Clone analysis process
Grammar definition
ANTLR 4
generation
Lexer
parser

14
Clone analysis process
Grammar definition
ANTLR 4
generation
sum 1
+=
int num
eCST
Lexer
parser

14
Clone analysis process
Grammar definition
ANTLR 4
generation
sum 1
+=
int num
eCST
Lexer
parser
OutOfStep

14
Clone analysis process
Grammar definition
ANTLR 4
generation
sum 1
+=
int num
eCST
Lexer
parser
OutOfStep
Type 1
Type 2Type 3
Clone
detection

15
Abstract Syntax Trees
main(int num) {
int sum = 0;
for (i in 1..num){
// Count numbers
// sum = sum + 1;
sum += 1;
}
}

15
Abstract Syntax Trees
main(int num) {
int sum = 0;
for (i in 1..num){
// Count numbers
// sum = sum + 1;
sum += 1;
}
}
File

15
Abstract Syntax Trees
main(int num) {
int sum = 0;
for (i in 1..num){
// Count numbers
// sum = sum + 1;
sum += 1;
}
}
File
fun_decl fun_body

15
Abstract Syntax Trees
main(int num) {
int sum = 0;
for (i in 1..num){
// Count numbers
// sum = sum + 1;
sum += 1;
}
}
sum 1
+=
int num
File
fun_decl fun_body
variablefor
literalliteral
for_parts
posfix
parameter
statement
i 1 3

Abstract Syntax Trees
int ten = 10;val ten = 10

Abstract Syntax Trees
int ten = 10;val ten = 10

17
Enriched Concrete Syntax Tree (eCST)
main(int num) {
int sum = 0;
for (i in 1..num){
// Count numbers
// sum = sum + 1;
sum += 1;
}
}

17
Enriched Concrete Syntax Tree (eCST)
main(int num) {
int sum = 0;
for (i in 1..num){
// Count numbers
// sum = sum + 1;
sum += 1;
}
}
File
Advance nodes

17
Enriched Concrete Syntax Tree (eCST)
main(int num) {
int sum = 0;
for (i in 1..num){
// Count numbers
// sum = sum + 1;
sum += 1;
}
}
File
fun_decl fun_body
Universal nodes Advance nodes Stop nodes

17
Enriched Concrete Syntax Tree (eCST)
main(int num) {
int sum = 0;
for (i in 1..num){
// Count numbers
// sum = sum + 1;
sum += 1;
}
}
int num
File
fun_decl fun_body
literalliteral
parameter
Universal nodes Advance nodes Stop nodes

17
Enriched Concrete Syntax Tree (eCST)
main(int num) {
int sum = 0;
for (i in 1..num){
// Count numbers
// sum = sum + 1;
sum += 1;
}
}
int num
File
fun_decl fun_body
assignment
literalliteral
parameter
Universal nodes Advance nodes Stop nodes

17
Enriched Concrete Syntax Tree (eCST)
main(int num) {
int sum = 0;
for (i in 1..num){
// Count numbers
// sum = sum + 1;
sum += 1;
}
}
int num
File
fun_decl fun_body
assignmentloop
literalliteral
body
parameter
Universal nodes Advance nodes Stop nodes

17
Enriched Concrete Syntax Tree (eCST)
main(int num) {
int sum = 0;
for (i in 1..num){
// Count numbers
// sum = sum + 1;
sum += 1;
}
}
sum 1
+=
int num
File
fun_decl fun_body
assignment
literal
loop
literal
literalliteral
body
posfix
parameter
Universal nodes Advance nodes Stop nodes

18
Universal nodes represent syntactic grammar rules abstracting
programming languages’ concepts
Enriched Concrete Syntax Tree (eCST)

18
Advance nodes are used to give groupings and a more
hierarchical structure to eCSTs, easing the comparison across
language grammars
Universal nodes represent syntactic grammar rules abstracting
programming languages’ concepts
Enriched Concrete Syntax Tree (eCST)

18
Advance nodes are used to give groupings and a more
hierarchical structure to eCSTs, easing the comparison across
language grammars
Universal nodes represent syntactic grammar rules abstracting
programming languages’ concepts
text: while
line: 3
column: 1
clone type: -
Enriched Concrete Syntax Tree (eCST)

19
Stop nodes allow us to fragment the eCST, to compare code
snippets across different tree locations
Enriched Concrete Syntax Tree (eCST)
a
get_bget_a
b
File
fun_decl
fun_body
attribute attribute
fun_decl
fun_body
b
get_bget_a
a
File
fun_decl
fun_body
attribute attribute
fun_decl
fun_body

19
Stop nodes allow us to fragment the eCST, to compare code
snippets across different tree locations
Enriched Concrete Syntax Tree (eCST)
a
get_bget_a
b
File
fun_decl
fun_body
attribute attribute
fun_decl
fun_body
b
get_bget_a
a
File
fun_decl
fun_body
attribute attribute
fun_decl
fun_body

20
Clone detection metric
The first step in the comparison passes through a mapping of universal
nodes, where we state which nodes represent similar concepts in the
languages

20
Clone detection metric
The first step in the comparison passes through a mapping of universal
nodes, where we state which nodes represent similar concepts in the
languages
type
parameter_type
function_type
...

20
Clone detection metric
The first step in the comparison passes through a mapping of universal
nodes, where we state which nodes represent similar concepts in the
languages
type
parameter_type
function_type
...
Second, we compare the tokens’ text
text1text2=
?

21
Clone detection metric
Universal node type Token comparison Clone type

21
Clone detection metric
Universal node type Token comparison Clone type
Not similar Not similar No clone

21
Clone detection metric
Universal node type Token comparison Clone type
Not similar Not similar No clone
Same type Exactly equal Type 1 clone

21
Clone detection metric
Universal node type Token comparison Clone type
Not similar Not similar No clone
Same type Exactly equal Type 1 clone
Same type Not equal Type 2 clone

21
Clone detection metric
Universal node type Token comparison Clone type
Not similar Not similar No clone
Same type Exactly equal Type 1 clone
Same type Not equal Type 2 clone
Similar type Not equal Type 3 clone

21
Clone detection metric
Universal node type Token comparison Clone type
Not similar Not similar No clone
Same type Exactly equal Type 1 clone
Same type Not equal Type 2 clone
Similar type Not equal Type 3 clone
m=3T
1
+2T
2
+T
3
Find the most similar clone pair:

22
Clone detection example
a
get_bget_a
b
File
fun_decl
fun_body
attribute attribute
fun_decl
fun_body
b
get_bget_a
a
File
fun_decl
fun_body
attribute attribute
fun_decl
fun_body
fun get_a() {String b = “b” }
fun get_b() {Int a = 1 }
get_a() {int a = 1; }
get_b() {String b = “b”; }

23
Clone detection example
fun get_a() {String b = “b” }
fun get_b() {Int a = 1 }
get_a() {int a = 1; }
get_b() {String b = “b”; }
get_a get_b
get_a
Type 1
m = 3
Type 2
m = 2
get_b
Type 2
m = 2
Type 1
m = 3

24
Clone detection example
a
get_bget_a
b
File
fun_decl
fun_body
attribute attribute
fun_decl
fun_body
b
get_bget_a
a
File
fun_decl
fun_body
attribute attribute
fun_decl
fun_body
fun get_a() {String b = “b” }
fun get_b() {Int a = 1 }
get_a() {int a = 1; }
get_b() {String b = “b”; }

25
Clone detection example
a b
b
Type 2
m = 2
Type 1
m = 3
a
Type 1
m = 3
Type 2
m = 2
fun get_a() {String b = “b” }
fun get_b() {Int a = 1 }
get_a() {int a = 1; }
get_b() {String b = “b”; }

26
Clone detection example
a
get_bget_a
b
File
fun_decl
fun_body
attribute attribute
fun_decl
fun_body
b
get_bget_a
a
File
fun_decl
fun_body
attribute attribute
fun_decl
fun_body
fun get_a() {String b = “b” }
fun get_b() {Int a = 1 }
get_a() {int a = 1; }
get_b() {String b = “b”; }

26
Clone detection example
a
get_bget_a
b
File
fun_decl
fun_body
attribute attribute
fun_decl
fun_body
b
get_bget_a
a
File
fun_decl
fun_body
attribute attribute
fun_decl
fun_body
fun get_a() {String b = “b” }
fun get_b() {Int a = 1 }
get_a() {int a = 1; }
get_b() {String b = “b”; }

27
OUT OF STEP at work
1.Language Features
1.Variables
2.Functions
3.Conditionals
4.Loops
5.Classes
2.Sorting Algorithms
3.Mobile Apps
https://flaglab.github.io/CloneDetection/

28
OUT OF STEP at work
•144 applications evaluated (72 Kotlin, 72 Dart)
•12 different application domains
•Medium to large (~ 3200LOC) applications
•Different production levels (novice, intermediate,
enterprise)

29
OUT OF STEP at work
[Out of step: : Code clone detection for mobile apps across different language codebases, SciCo 2024]

Measuring similarity

31
Similarity vs Diversity
int main() {
int a[] = {1, 2, 3, 4, 5};
int n = 5;
int ans = -1;
for (int i = 1; i < n; ++i) {
if (ans==-1 || a[i] >= ans)
ans = a[i];
}
return 0;
}
int main() {
int a[] = {1, 2, 3, 4, 5};
int n = 5, k = 4;
int ans = -1;
for (int i = 0; i < n && ans == -1; ++i) {
if (a[i] == k)
ans = i;
}
return 0;
}
int main() {
int a[] = {1, 2, 3, 4, 5};
int n = 5;
int i = 0;
while (i < n) {
if (i == 0 || a[i - 1] <= a[i])
i++;
else {
int t = a[i-1];
a[i - 1] = a[i];
a[i] = t;
i--;
}
}
return 0;
}

32
Control flow graph extraction
int main() {
int a[] = {1, 2, 3, 4, 5};
int n = 5;
int ans = -1;
for (int i = 1; i < n; ++i) {
if (ans==-1 || a[i] >= ans)
ans = a[i];
}
return 0;
}
CFG
language specific
constructs

33
Comparing CFGs
a = [1,2,3,4,5]
n,k = 5, 4
lo, hi, ans = 0, n-1, -1
for i in range(n):
return 0
if a[i] == k: ans = i
range(n)
a[i] != k
a[i]==k
a = [1,2,3,4,5]
n,ans = 5, -1
for i in range(n):
if ans == -1 or a[i] >= a[ans]
return 0
ans = i
not(ans==-1 or a[i]>=a[ans])
ans==-1 or a[i]>=a[ans]
range(n)

33
Comparing CFGs
a = [1,2,3,4,5]
n,k = 5, 4
lo, hi, ans = 0, n-1, -1
for i in range(n):
return 0
if a[i] == k: ans = i
range(n)
a[i] != k
a[i]==k
a = [1,2,3,4,5]
n,ans = 5, -1
for i in range(n):
if ans == -1 or a[i] >= a[ans]
return 0
ans = i
not(ans==-1 or a[i]>=a[ans])
ans==-1 or a[i]>=a[ans]
range(n)

33
Comparing CFGs
a = [1,2,3,4,5]
n,k = 5, 4
lo, hi, ans = 0, n-1, -1
for i in range(n):
return 0
if a[i] == k: ans = i
range(n)
a[i] != k
a[i]==k
a = [1,2,3,4,5]
n,ans = 5, -1
for i in range(n):
if ans == -1 or a[i] >= a[ans]
return 0
ans = i
not(ans==-1 or a[i]>=a[ans])
ans==-1 or a[i]>=a[ans]
range(n)
a = [1,2,3,4,5]
n,k = 5, 4
lo, hi, ans = 0, n-1, -1
while lo < hi:
mid = (lo + hi) / 2
if k <= a[mid]:
if a[lo] == k:
ans = lo
return 0
hi = mid lo = mid + 1
lo<hi
lo>=hi
a[lo]==k
a[lo]!=k
k>a[mid]
k<=a[mid]

33
Comparing CFGs
a = [1,2,3,4,5]
n,k = 5, 4
lo, hi, ans = 0, n-1, -1
for i in range(n):
return 0
if a[i] == k: ans = i
range(n)
a[i] != k
a[i]==k
a = [1,2,3,4,5]
n,ans = 5, -1
for i in range(n):
if ans == -1 or a[i] >= a[ans]
return 0
ans = i
not(ans==-1 or a[i]>=a[ans])
ans==-1 or a[i]>=a[ans]
range(n)
a = [1,2,3,4,5]
n,k = 5, 4
lo, hi, ans = 0, n-1, -1
while lo < hi:
mid = (lo + hi) / 2
if k <= a[mid]:
if a[lo] == k:
ans = lo
return 0
hi = mid lo = mid + 1
lo<hi
lo>=hi
a[lo]==k
a[lo]!=k
k>a[mid]
k<=a[mid]

33
Comparing CFGs
a = [1,2,3,4,5]
n,k = 5, 4
lo, hi, ans = 0, n-1, -1
for i in range(n):
return 0
if a[i] == k: ans = i
range(n)
a[i] != k
a[i]==k
a = [1,2,3,4,5]
n,ans = 5, -1
for i in range(n):
if ans == -1 or a[i] >= a[ans]
return 0
ans = i
not(ans==-1 or a[i]>=a[ans])
ans==-1 or a[i]>=a[ans]
range(n)
a = [1,2,3,4,5]
n,k = 5, 4
lo, hi, ans = 0, n-1, -1
while lo < hi:
mid = (lo + hi) / 2
if k <= a[mid]:
if a[lo] == k:
ans = lo
return 0
hi = mid lo = mid + 1
lo<hi
lo>=hi
a[lo]==k
a[lo]!=k
k>a[mid]
k<=a[mid]
max
Linear searchBinary search

34
Measuring diversity process
Code artifacts

34
Measuring diversity process
Code artifacts
Intermediate
Representation
(IR)

34
Measuring diversity process
Code artifacts
(g)CFG
Intermediate
Representation
(IR)

34
Measuring diversity process
Code artifacts
(g)CFG
Intermediate
Representation
(IR)
Similarity matching
and
Local Relative
Entropy (LRE)

34
Measuring diversity process
Code artifacts
(g)CFG
Intermediate
Representation
(IR)
Similarity matching
and
Local Relative
Entropy (LRE)
1−
r
ij
max(R)
Similarity score

1
%retval = alloca i32, align 4
%a = alloca [5 x i32], align 16
%n = alloca i32, align 4
%ans = alloca i32, align 4
%i = alloca i32, align 4
store i32 0, i32* %retval
%0 = bitcast [5 x i32]* %a to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([5 x i32]* @_ZZ4mainE1a to i8*), i64 20, i32 16, i1 false)
store i32 5, i32* %n, align 4
%1 = load i32* %n, align 4
%cmp = icmp sgt i32 %1, 0
br i1 %cmp, label %cond.true, label %cond.false
2
%arrayidx = getelementptr inbounds [5 x i32]* %a, i32 0, i64 0
%2 = load i32* %arrayidx, align 4
br label %cond.end
3
br label %cond.end
4
%cond = phi i32 [ %2, %cond.true ], [ 0, %cond.false ]
store i32 %cond, i32* %ans, align 4
store i32 1, i32* %i, align 4
br label %for.cond
5
%3 = load i32* %i, align 4
%4 = load i32* %n, align 4
%cmp1 = icmp slt i32 %3, %4
br i1 %cmp1, label %for.body, label %for.end
6
%5 = load i32* %i, align 4
%idxprom = sext i32 %5 to i64
%arrayidx2 = getelementptr inbounds [5 x i32]* %a, i32 0, i64 %idxprom
%6 = load i32* %arrayidx2, align 4
%7 = load i32* %ans, align 4
%cmp3 = icmp sgt i32 %6, %7
br i1 %cmp3, label %if.then, label %if.end
7
%8 = load i32* %i, align 4
%idxprom4 = sext i32 %8 to i64
%arrayidx5 = getelementptr inbounds [5 x i32]* %a, i32 0, i64 %idxprom4
%9 = load i32* %arrayidx5, align 4
store i32 %9, i32* %ans, align 4
br label %if.end
8
%10 = load i32* %i, align 4
%inc = add nsw i32 %10, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
9
ret i32 0
35
Control flow graph extraction
int main() {
int a[] = {1, 2, 3, 4, 5};
int n = 5;
int ans = -1;
for (int i = 1; i < n; ++i) {
if (ans==-1 || a[i] >= ans)
ans = a[i];
}
return 0;
}
LLVM
9
7
8
5
4
1
2
3
6
GPS
CFG

36
Similarity between nodes
9
7
8
5
4
1
2
3
6
7
8
5
4
1
2
3
6
in-nodes of i’s and j’s neighbors
[An Structural Approach to Program Similarity Analysis, SQAMIA’24]
out-nodes of i’s and j’s neighbors
x
k+1
ij
=
s
k+1
in(i,j)+s
k+1
out(i,j)
2

36
Similarity between nodes
9
7
8
5
4
1
2
3
6
max
Linear search
7
8
5
4
1
2
3
6
in-nodes of i’s and j’s neighbors
[An Structural Approach to Program Similarity Analysis, SQAMIA’24]
out-nodes of i’s and j’s neighbors
x
k+1
ij
=
s
k+1
in(i,j)+s
k+1
out(i,j)
2

37
Similarity between nodes
9
7
8
5
4
1
2
3
6
max
Linear search
7
8
5
4
1
2
3
6
[An Structural Approach to Program Similarity Analysis, SQAMIA’24]
4
max,ls
=
1+
1
2
2
=0.75
5
max,ls
=
1+1
2
=1

37
Similarity between nodes
9
7
8
5
4
1
2
3
6
max
Linear search
7
8
5
4
1
2
3
6
[An Structural Approach to Program Similarity Analysis, SQAMIA’24]
4
max,ls
=
1+
1
2
2
=0.75
5
max,ls
=
1+1
2
=1

38
Local relative entropy - LRE
9
7
8
5
4
1
2
3
6
max
Linear search
7
8
5
4
1
2
3
6

38
Local relative entropy - LRE
9
7
8
5
4
1
2
3
6
max
Linear search
7
8
5
4
1
2
3
6

38
Local relative entropy - LRE
9
7
8
5
4
1
2
3
6
max
Linear search
Binary search
7
8
5
4
1
2
3
6
9
7
8
5
41
2
3
6
Linear search
7
8
5
4
1
2
3
6

38
Local relative entropy - LRE
9
7
8
5
4
1
2
3
6
max
Linear search
Binary search
7
8
5
4
1
2
3
6
9
7
8
5
41
2
3
6
Linear search
7
8
5
4
1
2
3
6

38
Local relative entropy - LRE
9
7
8
5
4
1
2
3
6
max
Linear search
Binary search
7
8
5
4
1
2
3
6
9
7
8
5
41
2
3
6
Linear search
7
8
5
4
1
2
3
6

39
Measuring similarity
Built a corpus of 5 programs from programming contest platforms
with 566 implementations between them
Is there similarity or diversity among the implementations to a
program?

40
558B

41
All problems

Tracking differences
Created by PixelX
from Noun Project

43
Divergence
APP

43
Divergence
APP

43
Divergence
APP
FEATURE DIVERGENCE

44
Divergence
var sum = 0;
for (i in 1..100) {
sum = sum + i
}
int sum = 0;
int i = 1;
while(i<=100) {
sum = sum + i;
i = i + 1;
}

45
Divergence
for=
FUN
ASSIGNMENT
EXP
LOOP
BODY
sum +
sumLITTERAL BODY
sum 0
+
i
for
FUN
ASSIGNMENT
EXP
LOOP
BODY
sum i
sumLITTERAL BODY
int sum 0i
VAR
= sum 1
ASSIGNMENT
i
+ EXP

46
Divergence
var sum = 0;
for (i in 1..100) {
sum = sum + i
}
int sum = 0;
int i = 1;
while(i<=100) {
if(i%2 == 0)
sum = sum + i;
i = i + 1;
}

47
Detecting divergence
+
i
for
FUN
ASSIGNMENT
CONDITION
LOOP
BODY
sum
i
sumLITTERAL BODY
int sum 0
ASSIGNMENT
i
+ EXP
EXP
i % 2 …

48
Managing divergence
+
CONDITION
sum
EXP
i % 2 …
+
CONDITION
sum
EXP
i % 2 …
MULT_OP ADD_OP

48
Managing divergence
+
CONDITION
sum
EXP
i % 2 …
+
CONDITION
sum
EXP
i % 2 …
MULT_OP ADD_OP
if(i%2==0)
sum = sum + i
L4 - L4 ++→

return 0;

50
Closing remarks
Systems are becoming more
polyglot and we need to create the
tools to manage and analyze them

50
Closing remarks
Abstractions are
key to associate
elements from
different
programming
languages, going
over their
syntactic
meaning.
Systems are becoming more
polyglot and we need to create the
tools to manage and analyze them

50
Closing remarks
Abstractions are
key to associate
elements from
different
programming
languages, going
over their
syntactic
meaning.
Systems are becoming more
polyglot and we need to create the
tools to manage and analyze them
Abstracting over language
features it is possible to
detect similarities,
differences, and move
between language
implementations

FLAGL BA
AI, RL, ML,
Ontologies,
NLP
Algorithms
and Data
Structures
Bioinformatic
software and
algorithms
Programming
Language
Implementations
(post-quantum)
cryptography

@FLAGlab research