Skip to content

Commit 66caf83

Browse files
authored
Removed external library and added clustering algorithm (#532)
* Added basic interfaces * Added an initial draft * Added an initial draft * Fixed a few bugs * Added KDocs * Added KDocs * Fixed a few bugs * Fixed a few docs * Formatted code * Formatted code
1 parent 87283e3 commit 66caf83

File tree

11 files changed

+447
-24
lines changed

11 files changed

+447
-24
lines changed

utbot-summary/build.gradle

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,9 @@ dependencies {
55
api project(':utbot-framework-api')
66
compile(project(':utbot-instrumentation'))
77

8-
implementation group: 'com.github.haifengl', name: 'smile-kotlin', version: '2.6.0'
9-
implementation group: 'com.github.haifengl', name: 'smile-core', version: '2.6.0'
10-
118
implementation group: 'io.github.microutils', name: 'kotlin-logging', version: kotlin_logging_version
129

1310
implementation group: 'com.github.javaparser', name: 'javaparser-core', version: '3.22.1'
11+
12+
testImplementation("org.junit.jupiter:junit-jupiter:$junit5_version")
1413
}

utbot-summary/src/main/kotlin/org/utbot/summary/UtSummarySettings.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ object UtSummarySettings {
4848
* DBSCAN hyperparameter
4949
* Sets radius of search for algorithm
5050
*/
51-
var RADIUS_DBSCAN: Double = 5.0
51+
var RADIUS_DBSCAN: Float = 5.0f
5252
}
5353

5454
object SummarySentenceConstants {

utbot-summary/src/main/kotlin/org/utbot/summary/clustering/ExecutionDistance.kt renamed to utbot-summary/src/main/kotlin/org/utbot/summary/clustering/ExecutionMetric.kt

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
11
package org.utbot.summary.clustering
22

33
import org.utbot.framework.plugin.api.Step
4-
import smile.math.distance.Distance
5-
6-
class ExecutionDistance : Distance<Iterable<Step>> {
7-
override fun d(x: Iterable<Step>, y: Iterable<Step>): Double {
8-
return compareTwoPaths(x, y)
9-
}
4+
import org.utbot.summary.clustering.dbscan.Metric
105

6+
/** The existing implementation of [Metric] for the space of [Step]. */
7+
class ExecutionMetric : Metric<Iterable<Step>> {
118
/**
129
* Minimum Edit Distance
1310
*/
@@ -19,7 +16,7 @@ class ExecutionDistance : Distance<Iterable<Step>> {
1916
val stmt1 = path1.elementAt(i)
2017
val stmt2 = path2.elementAt(j)
2118

22-
val d1 = distances[i - 1][j] + 1 //path 1 insert -> diff stmt from path2
19+
val d1 = distances[i - 1][j] + 1 // path 1 insert -> diff stmt from path2
2320
val d2 = distances[i][j - 1] + 1 // path 2 insert -> diff stmt from path1
2421
val d3 = distances[i - 1][j - 1] + distance(stmt1, stmt2) // aligned or diff
2522
distances[i][j] = minOf(d1, d2, d3)
@@ -31,4 +28,8 @@ class ExecutionDistance : Distance<Iterable<Step>> {
3128
private fun distance(stmt1: Step, stmt2: Step): Int {
3229
return if (stmt1 == stmt2) 0 else 2
3330
}
31+
32+
override fun compute(object1: Iterable<Step>, object2: Iterable<Step>): Double {
33+
return compareTwoPaths(object1, object2)
34+
}
3435
}

utbot-summary/src/main/kotlin/org/utbot/summary/clustering/MatrixUniqueness.kt

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ package org.utbot.summary.clustering
33
import org.utbot.framework.plugin.api.Step
44
import org.utbot.framework.plugin.api.UtExecution
55
import org.utbot.summary.UtSummarySettings
6-
import smile.clustering.dbscan
6+
import org.utbot.summary.clustering.dbscan.DBSCANTrainer
7+
import org.utbot.summary.clustering.dbscan.neighbor.LinearRangeQuery
78

89
class MatrixUniqueness(executions: List<UtExecution>) {
910

@@ -21,7 +22,10 @@ class MatrixUniqueness(executions: List<UtExecution>) {
2122
}
2223

2324
/**
24-
* Creates uniquness matrix. Rows are executions, columns are unique steps from all executions
25+
* Creates uniqueness matrix.
26+
*
27+
* Rows are executions, columns are unique steps from all executions
28+
*
2529
* Every matrix element (i, j) is 1 if the step is present in the execution and 0 otherwise.
2630
*/
2731
private fun createMatrix(): List<IntArray> {
@@ -49,10 +53,10 @@ class MatrixUniqueness(executions: List<UtExecution>) {
4953
private fun colSums(matrix: List<IntArray>) = matrix.first().indices.map { col -> this.colSum(matrix, col) }
5054

5155
/**
52-
* Splits all steps into common, partly common and unique
56+
* Splits all steps into common, partly common and unique.
5357
*
54-
* Unique steps are steps that only occur in one execution
55-
* Common steps are steps that occur in all executions
58+
* Unique steps are steps that only occur in one execution.
59+
* Common steps are steps that occur in all executions.
5660
* Partly common steps are steps that occur more than one time, but not in all executions
5761
*/
5862
fun splitSteps(): SplitSteps {
@@ -74,19 +78,24 @@ class MatrixUniqueness(executions: List<UtExecution>) {
7478
}
7579

7680
companion object {
77-
/**
78-
* Returns map: cluster identifier, List<executions>
79-
* DBSCAN - Density-Based Spatial Clustering of Applications with Noise
80-
* Finds core samples of high density and expands clusters from them
81-
*/
81+
/** Returns map: cluster identifier, List<executions>. */
8282
fun dbscanClusterExecutions(
8383
methodExecutions: List<UtExecution>,
8484
minPts: Int = UtSummarySettings.MIN_EXEC_DBSCAN,
85-
radius: Double = UtSummarySettings.RADIUS_DBSCAN
85+
radius: Float = UtSummarySettings.RADIUS_DBSCAN
8686
): Map<Int, List<UtExecution>> {
87+
8788
val executionPaths = methodExecutions.map { it.path.asIterable() }.toTypedArray()
88-
val cluster = dbscan(executionPaths, ExecutionDistance(), minPts, radius)
89-
return methodExecutions.withIndex().groupBy({ cluster.y[it.index] }, { it.value })
89+
90+
val dbscan = DBSCANTrainer(
91+
eps = radius,
92+
minSamples = minPts,
93+
metric = ExecutionMetric(),
94+
rangeQuery = LinearRangeQuery()
95+
)
96+
val dbscanModel = dbscan.fit(executionPaths)
97+
val clusterLabels = dbscanModel.clusterLabels
98+
return methodExecutions.withIndex().groupBy({ clusterLabels[it.index] }, { it.value })
9099
}
91100
}
92101
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
package org.utbot.summary.clustering.dbscan
2+
3+
/**
 * Keeps the information about clusters produced by [DBSCANTrainer].
 *
 * [equals], [hashCode] and [toString] are overridden to work on the *contents* of [clusterLabels]:
 * the implementations generated for a data class treat an array property by identity, so two models
 * holding the same labels would otherwise be reported as different.
 *
 * @property [numberOfClusters] Number of clusters.
 * @property [clusterLabels] It contains labels of clusters in the range ```[0; k)```
 * or [Int.MIN_VALUE] if point could not be assigned to any cluster.
 */
data class DBSCANModel(
    val numberOfClusters: Int = 0,
    val clusterLabels: IntArray
) {
    /** Two models are equal when they have the same number of clusters and element-wise equal labels. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is DBSCANModel) return false
        return numberOfClusters == other.numberOfClusters && clusterLabels.contentEquals(other.clusterLabels)
    }

    /** Hash code consistent with [equals]: derived from the label contents, not the array identity. */
    override fun hashCode(): Int = 31 * numberOfClusters + clusterLabels.contentHashCode()

    /** Renders the actual labels instead of the array's identity string. */
    override fun toString(): String =
        "DBSCANModel(numberOfClusters=$numberOfClusters, clusterLabels=${clusterLabels.contentToString()})"
}
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
package org.utbot.summary.clustering.dbscan
2+
3+
import org.utbot.summary.clustering.dbscan.neighbor.LinearRangeQuery
4+
import org.utbot.summary.clustering.dbscan.neighbor.Neighbor
5+
import org.utbot.summary.clustering.dbscan.neighbor.RangeQuery
6+
7+
/** Label of a point that could not be assigned to any cluster. */
private const val NOISE = Int.MIN_VALUE

/** Temporary label of a point that is already queued for the cluster being expanded. */
private const val CLUSTER_PART = -2

/** Initial label of a point that has not been visited yet. */
private const val UNDEFINED = -1

/**
 * DBSCAN algorithm implementation.
 *
 * NOTE: The existing implementation with the [LinearRangeQuery] has a complexity O(n^2) in the worst case.
 *
 * @property [eps] The radius of search. Should be more than 0.0.
 * @property [minSamples] The minimum number of samples to form the cluster. Should be more than 0.
 * @property [metric] Metric to calculate distances.
 * @property [rangeQuery] Gives access to the data in the implemented order.
 *
 * @see <a href="https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf">
 *     A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
 */
class DBSCANTrainer<T>(val eps: Float, val minSamples: Int, val metric: Metric<T>, val rangeQuery: RangeQuery<T>) {
    init {
        require(minSamples > 0) { "MinSamples parameter should be more than 0: $minSamples" }
        require(eps > 0.0f) { "Eps parameter should be more than 0: $eps" }
    }

    /**
     * Builds a clustering model based on the given data.
     *
     * When [rangeQuery] is a [LinearRangeQuery], its `data` and `metric` are overwritten
     * with [data] and [metric] before the search starts.
     *
     * @throws IllegalArgumentException if [data] is empty.
     */
    fun fit(data: Array<T>): DBSCANModel {
        require(data.isNotEmpty()) { "Nothing to learn, data is empty." }

        // TODO: could be refactored if we add some new variants of RangeQuery
        if (rangeQuery is LinearRangeQuery) {
            rangeQuery.data = data
            rangeQuery.metric = metric
        }

        val labels = IntArray(data.size) { UNDEFINED }

        // Grows within [0; k), where k is the final number of clusters found by DBSCAN.
        var nextCluster = 0

        for ((i, point) in data.withIndex()) {
            // Points labelled during an earlier expansion are not revisited.
            if (labels[i] != UNDEFINED) continue

            val seeds = rangeQuery.findNeighbors(point, eps).toMutableList()
            if (seeds.size >= minSamples) {
                labels[i] = nextCluster
                expandCluster(seeds, labels, nextCluster)

                // The current cluster can not be expanded any further, move on to the next label.
                nextCluster++
            } else {
                labels[i] = NOISE
            }
        }

        return DBSCANModel(numberOfClusters = nextCluster, clusterLabels = labels)
    }

    /**
     * Assigns the label [k] to every point reachable from the seed list [neighbors].
     *
     * [neighbors] is used as a work queue: it may grow while it is being processed.
     */
    private fun expandCluster(
        neighbors: MutableList<Neighbor<T>>,
        labels: IntArray,
        k: Int
    ) {
        // Every not yet visited seed is marked as queued for this cluster.
        for (seed in neighbors) {
            if (labels[seed.index] == UNDEFINED) {
                labels[seed.index] = CLUSTER_PART
            }
        }

        // NOTE: the queue can grow during the iteration, hence the manual cursor instead of a for-loop.
        var cursor = 0
        while (cursor < neighbors.size) {
            val candidate = neighbors[cursor]
            val idx = candidate.index

            when (labels[idx]) {
                // Noise becomes a border point of the cluster; it is not expanded further.
                NOISE -> {
                    labels[idx] = k
                }
                UNDEFINED, CLUSTER_PART -> {
                    labels[idx] = k

                    val reachable = rangeQuery.findNeighbors(candidate.key, eps)
                    if (reachable.size >= minSamples) {
                        mergeTwoGroupsInCluster(reachable, labels, neighbors)
                    }
                }
            }
            cursor++
        }
    }

    /**
     * Appends to [neighbors] those points of [qNeighbors] that are still unvisited or marked as noise.
     *
     * Unvisited points are additionally marked as queued so that they are not appended twice.
     */
    private fun mergeTwoGroupsInCluster(
        qNeighbors: List<Neighbor<T>>,
        labels: IntArray,
        neighbors: MutableList<Neighbor<T>>
    ) {
        for (qNeighbor in qNeighbors) {
            when (labels[qNeighbor.index]) {
                UNDEFINED -> {
                    labels[qNeighbor.index] = CLUSTER_PART
                    neighbors.add(qNeighbor)
                }
                NOISE -> {
                    neighbors.add(qNeighbor)
                }
            }
        }
    }
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
package org.utbot.summary.clustering.dbscan
2+
3+
/** Distance function over objects of type [T]. */
interface Metric<T> {
    /** Computes the distance between [object1] and [object2] according to this metric. */
    fun compute(object1: T, object2: T): Double
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package org.utbot.summary.clustering.dbscan.neighbor
2+
3+
import org.utbot.summary.clustering.dbscan.Metric
4+
5+
/**
 * Brute-force range search: every query scans the whole [data] once, i.e. O(n) per query.
 *
 * Both properties are `lateinit` and have to be assigned before [findNeighbors] is called.
 *
 * @property [data] The whole dataset to search in.
 * @property [metric] Metric used to measure the distance between the query and each point.
 */
class LinearRangeQuery<K> : RangeQuery<K> {
    lateinit var data: Array<K>
    lateinit var metric: Metric<K>

    /**
     * Returns all points of [data] whose distance to [queryKey] does not exceed [radius].
     *
     * Points that are equal to [queryKey] (in terms of `equals`) are skipped; this excludes the query
     * point itself and also any other data point that is equal to it.
     */
    override fun findNeighbors(queryKey: K, radius: Float): List<Neighbor<K>> =
        data.mapIndexedNotNull { index, point ->
            val distance = metric.compute(queryKey, point)
            if (distance <= radius && queryKey != point) Neighbor(point, index, distance) else null
        }
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package org.utbot.summary.clustering.dbscan.neighbor
2+
3+
/**
 * A point found by a neighbor search in a metric space.
 *
 * @property [key] Search key.
 * @property [index] Direct index to access the point in the basic data structure that keeps a set of points.
 * @property [distance] Numerical value that keeps distance from the [key] point in the chosen metric space.
 *
 * NOTE: Neighbors are ordered via the [Comparable] interface: by distance first, then by index.
 */
class Neighbor<K>(val key: K, val index: Int, private val distance: Double) : Comparable<Neighbor<K>> {
    override fun compareTo(other: Neighbor<K>): Int {
        val byDistance = distance.compareTo(other.distance)
        if (byDistance != 0) return byDistance

        // Equal distances: the index breaks the tie.
        return index.compareTo(other.index)
    }
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package org.utbot.summary.clustering.dbscan.neighbor
2+
3+
/** Basic interface of the search strategies that select, from the set of all points, the neighbors of a given point. */
interface RangeQuery<K> {
    /** Returns the list of neighbors located within the [radius] from the [queryKey]. */
    fun findNeighbors(queryKey: K, radius: Float): List<Neighbor<K>>
}

0 commit comments

Comments
 (0)