390 likes | 521 Views
MeCC: Memory Comparison based Clone Detector. Heejung Kim1, Yungbum Jung1, Sunghun Kim2, and Kwangkeun Yi1 1 Seoul National University 2 The Hong Kong University of Science and Technology By Choi Yong suk. Code Clones. static PyObject * float_mul(PyObject *v, PyObject *w) { double a,b;
E N D
MeCC: Memory Comparison based Clone Detector Heejung Kim1, Yungbum Jung1, Sunghun Kim2, and Kwangkeun Yi1 1 Seoul National University 2 The Hong Kong University of Science and Technology By Choi Yong suk
Code Clones static PyObject * float_mul(PyObject *v, PyObject *w) { double a,b; CONVERT_TO_DOUBLE(v,a); CONVERT_TO_DOUBLE(w,b); PyFPE_START_PROTECT("multiply",return 0) a = a * b; PyFPE_END_PROTECT(a) return PyFloat_FromDouble(a); } static PyObject * float_add(PyObject *v, PyObject *w) { double a,b; CONVERT_TO_DOUBLE(v,a); CONVERT_TO_DOUBLE(w,b); PyFPE_START_PROTECT("add",return 0) a = a + b; PyFPE_END_PROTECT(a) return PyFloat_FromDouble(a); } • similar code fragments (syntactically or semantically)
Applications of Code Clones analysis software refactoring detecting potential bugs understanding software evolution detecting software plagiarism (malicious duplication)
Clone Detectors • CCFinder [TSE’02] • textual tokens • DECKARD [ICSE’07] • AST characteristic vectors • PDG-based [ICSE’08, SAS’01] • program dependence graph - Effective for syntactic code clones - Limited for semantic code clones
Three code clones missed by syntax-based clone detection
Control Replacement (1) PyObject *PyBool_FromLong (long ok) { PyObject *result; if (ok) result = Py_True; else result = Py_False; Py_INCREF(result); return result; } static PyObject *get_pybool (int istrue) { PyObject *result = istrue? Py_True: Py_False; Py_INCREF(result); return result; } syntactically different but semantically identical
Capturing Procedural Effects (2) void appendPQExpBufferChar (PQExpBuffer str, char ch) { /* Make more room if needed */ if (!enlargePQExpBuffer(str, 1)) return; /* OK, append the data */ str->data[str->len] = ch; str->len++; str->data[str->len] = '\0'; } void appendBinaryPQExpBuffer (PQExpBuffer str, const char* data, size_t datalen) { /* Make more room if needed */ if (!enlargePQExpBuffer(str, datalen)) return; /* OK, append the data */ memcpy(str->data + str->len, data, datalen); str->len+= datalen; str->data[str->len] = '\0'; } understanding memory behavior of procedures
More Complex Clone (3) ... *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf = ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) { return err; } conf->access_name = apr_pstrdup(cmd->pool,arg); return NULL; } ... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf = ap_get_module_config(cmd->server->module_config, &core_module); char *proto; if (err != NULL) { return err; } proto = apr_pstrdup(cmd->pool,arg); ap_str_tolower(proto); conf->protocol = proto; return NULL; }
… *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf = ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) { return err; } conf->access_name = apr_pstrdup(cmd->pool,arg); return NULL; } ... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf = ap_get_module_config(cmd->server->module_config, &core_module); char *proto; if (err != NULL) { return err; } proto = apr_pstrdup(cmd->pool,arg); ap_str_tolower(proto); conf->protocol = proto; return NULL; } statement reordering
… *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf = ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) { return err; } conf->access_name = apr_pstrdup(cmd->pool,arg); return NULL; } intermediate variables ... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf = ap_get_module_config(cmd->server->module_config, &core_module); char *proto; if (err != NULL) { return err; } proto = apr_pstrdup(cmd->pool,arg); ap_str_tolower(proto); conf->protocol = proto; return NULL; } statement reordering
… *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf = ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) { return err; } conf->access_name = apr_pstrdup(cmd->pool,arg); return NULL; } intermediate variables statement splitting ... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf = ap_get_module_config(cmd->server->module_config, &core_module); char *proto; if (err != NULL) { return err; } proto = apr_pstrdup(cmd->pool,arg); ap_str_tolower(proto); conf->protocol = proto; return NULL; } statement reordering
… *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf = ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) { return err; } conf->access_name = apr_pstrdup(cmd->pool,arg); return NULL; } intermediate variables statement splitting ... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf = ap_get_module_config(cmd->server->module_config, &core_module); char *proto; if (err != NULL) { return err; } proto = apr_pstrdup(cmd->pool,arg); ap_str_tolower(proto); conf->protocol = proto; return NULL; } statement reordering
These Semantic Clones are Identified by MeCC
MeCC: Approach Static analyzer estimates the semantics of programs Abstract memories are results of analysis Comparing abstract memories is a measure
Clone Detection Process procedures abstract memories Static Analyzer program Comparing Memories Code Clones similarities
Estimating Semantics by Abstract Memories (guarded value ) int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r; } {(guard , symbolic value)} (finite mapping) Estimating an abstract memory at the procedure’s exit point Abstract memory is a map from abstract addresses to abstract values
Estimating Semantics by Abstract Memories int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r; } • The abstract memory state • All abstract values are guarded by execution path conditions Use symbols for unknown input values
Estimating Semantics by Abstract Memories int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r; } • The abstract memory state • All abstract values are guarded by execution path conditions Use symbols for unknown input values
Estimating Semantics by Abstract Memories int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r; } int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r; } copy and modify int make2 (list2 *a, int b){ if (a==0) return b; a->n = malloc(...); a->n->v = b; return b + 2; }
Clone Detection Process procedures abstract memories Static Analyzer program Comparing Memories Code Clones similarities
Subject Projects Table 5: Time spent for the detection process.
CLONE TYPES Type-1 (Exact clones): Identical code fragments except for variations in whitespace, layout, and comments. Type-2 (Renamed clones): Syntactically identical fragments except for variations in identifiers, literals, and variable types in addition to Type-1's variations. Type-3 (Gapped clones): Copied fragments with further modifications such as changed, added, or deleted statements in addition to Type-2's variations. Type-4 (Semantic clones): Code fragments that perform similar functionality but are implemented by different syntactic variants.
Detected Clones & Semantic Clones 45% !! C. K. Roy and J. R. Cordy. A survey on software clone detection research. SCHOOL OF COMPUTING TR 2007-541, QUEEN’S UNIVERSITY, 115, 2007. Table 2: The distribution of detected clone types by MeCC.
Comparison Table 6: The numbers of detected Type-3 and Type-4 clones by MeCC, Deckard, CCFinder, and a PDG-based detector [9].
Applications of Code Clones analysis software refactoring detecting potential bugs understanding software evolution detecting software plagiarism (malicious duplication)
Finding Potential Bugs A large portion of semantic clones are due to inconsistent changes Inconsistent changes may lead to potential bugs (inconsistent clones) Two semantic clones with potential bugs
const char *GetVariable (VariableSpace space, const char *name) { struct_variable *current; if (!space) return NULL; for (current=space->next;current;current=current->next) { if (strcmp(current->name,name) == 0) { return current->value; } } return NULL; } const char *PQparameterStatus (const PGconn *conn, const char *paramName) { const pgParameterStatus *pstatus; if (!conn || !paramName) return NULL; for (pstatus=conn->pstatus; pstatus!=NULL; pstatus = pstatus->next) { if (strcmp(pstatus->name,paramName)== 0) return pstatus->value; } return NULL; } parameter name also should be checked! Missed Null Check
PyObject *pwd_getpwall (PyObject *self) { PyObject *d; struct passwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setpwent(); while ((p = getpwent()) != NULL) { PyObject *v = mkpwent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); return NULL; } Py_DECREF(v); } endpwent(); return d; } open user database close user database A resource leak without endpwent() procedure call A Resource Leak Bug (Python project revision #20157)
PyObject *spwd_getspall (PyObject *self, PyObject *args) { PyObject *d; struct spwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setspent(); while ((p = getspent()) != NULL) { PyObject *v = mkspent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); endspent(); return NULL; } Py_DECREF(v); } endspent(); return d; } PyObject *pwd_getpwall (PyObject *self) { PyObject *d; struct passwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setpwent(); while ((p = getpwent()) != NULL) { PyObject *v = mkpwent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); return NULL; } Py_DECREF(v); } endpwent(); return d; } A Bug-free Procedure (Python project revision #38359)
PyObject *spwd_getspall (PyObject *self, PyObject *args) { PyObject *d; struct spwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setspent(); while ((p = getspent()) != NULL) { PyObject *v = mkspent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); endspent(); return NULL; } Py_DECREF(v); } endspent(); return d; } PyObject *pwd_getpwall (PyObject *self) { PyObject *d; struct passwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setpwent(); while ((p = getpwent()) != NULL) { PyObject *v = mkpwent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); endpwent(); return NULL; } Py_DECREF(v); } endpwent(); return d; } Bug fixed The Bug is Fixed Later (Python project revision #73017)
Procedure A was created with a resource leak. Procedure B (a code clone of A) is introduced without resource leaks. The resource leak bug in procedure A is fixed. revision #20157 revision #38359 4 years The resource leak could have been fixed earlier if MeCC had been applied. revision #73017
Potential Bugs and Code Smells #Semantic Clones Potential Bugs (%) Code Smells (%) detected by MeCC Table 7: Exploitable bugs and code smells in Type-3 and Type-4 clones found by MeCC.
Conclusion MeCC: Memory Comparison-based Clone Detector a new clone detector using semantics-based static analysis tolerant to syntactic variations can be used to find potential bugs