Compare commits

...

10 Commits

Author SHA1 Message Date
Claudio Maggioni 7253cf8590 Added commit id in report 2023-11-08 22:31:16 +01:00
Claudio Maggioni b8e0a2c3c4 added name 2023-11-08 22:25:13 +01:00
Claudio Maggioni f374d2eeb5 Models fixed 2023-11-08 22:11:43 +01:00
Claudio Maggioni a288957112 report done 2023-11-07 15:07:15 +01:00
Claudio Maggioni ccda6c1c09 Report section 1 and 2 done 2023-11-07 12:35:27 +01:00
Claudio Maggioni dab12ddca7 wip report 2023-11-07 11:48:00 +01:00
Claudio Maggioni 9d8dd05428 done part 4 2023-10-25 15:45:04 +02:00
Claudio Maggioni 297f20d85e almost done part 4 2023-10-25 15:10:47 +02:00
Claudio Maggioni c644888371 part 3 done 2023-10-23 15:42:25 +02:00
Claudio Maggioni 8de7663a8a doc2vec executes 2023-10-16 16:36:25 +02:00
15 changed files with 1164 additions and 58 deletions

458
.gitignore vendored

@@ -1 +1,459 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
**/.DS_Store
out/model/*.pt
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb
## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf
## Generated if empty string is given at "Please type another file name for output:"
## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml
## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync
## Build tool directories for auxiliary files
# latexrun
latex.out/
## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa
# achemso
acs-*.bib
# amsthm
*.thm
# beamer
*.nav
*.pre
*.snm
*.vrb
# changes
*.soc
# comment
*.cut
# cprotect
*.cpt
# elsarticle (documentclass of Elsevier journals)
*.spl
# endnotes
*.ent
*.lox
# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm
#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R
# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist
# gnuplot
*.gnuplot
*.table
# gnuplottex
*-gnuplottex-*
# gregoriotex
*.gaux
*.glog
*.gtex
# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref
# hyperref
*.brf
# knitr
*-concordance.tex
# *.tikz
*-tikzDictionary
# listings
*.lol
# luatexja-ruby
*.ltjruby
# makeidx
*.idx
*.ilg
*.ind
# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*
# minted
_minted*
*.pyg
# morewrites
*.mw
# newpax
*.newpax
# nomencl
*.nlg
*.nlo
*.nls
# pax
*.pax
# pdfpcnotes
*.pdfpc
# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd
# scrwfile
*.wrt
# svg
svg-inkscape/
# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/
# pdfcomment
*.upa
*.upb
# pythontex
*.pytxcode
pythontex-files-*/
# tcolorbox
*.listing
# thmtools
*.loe
# TikZ & PGF
*.dpth
*.md5
*.auxlock
# titletoc
*.ptc
# todonotes
*.tdo
# vhistory
*.hst
*.ver
*.lod
# xcolor
*.xcp
# xmpincl
*.xmpi
# xindy
*.xdy
# xypic precompiled matrices and outlines
*.xyc
*.xyd
# endfloat
*.ttt
*.fff
# Latexian
TSWLatexianTemp*
## Editors:
# WinEdt
*.bak
*.sav
# Texpad
.texpadtmp
# LyX
*.lyx~
# Kile
*.backup
# gummi
.*.swp
# KBibTeX
*~[0-9]*
# TeXnicCenter
*.tps
# auto folder when using emacs and auctex
./auto/*
*.el
# expex forward references with \gathertags
*-tags.tex
# standalone packages
*.sta
# Makeindex log files
*.lpz
# xwatermark package
*.xwm
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

README.md

@@ -1,14 +1,85 @@
# Project 02: Multi-source code search
**Claudio Maggioni**
### About the Project
This project has the goal of developing a search engine able to query a large Python code repository using multiple sources of information.
This project has the goal of developing a search engine able to query a large Python code repository using multiple
sources of information.
It is part of the Knowledge Analysis & Management - 2022 course from the Università della Svizzera italiana.
In this repository, you can find the following files:
- tensor flow: a code repository to be used during this project
- ground-truth-unique: a file containing the reference triples necessary to evaluate the search engine (step 3)
- ground-truth-unique: a file containing the reference triples necessary to evaluate the search engine (step 3)
For more information, see the Project-02 slides (available on iCourse)
Note: Feel free to modify this file according to the project's necessities.
## Environment setup
To install the required dependencies make sure `python3` points to a Python 3.10 or 3.11 installation and then run:
```shell
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt
```
## Part 1: data extraction
To extract the data in file `data.csv` run the command:
```shell
python3 extract-data.py
```
The script prints the requested counts, namely:
```
Methods: 5817
Functions: 4565
Classes: 1882
Python Files: 2817
```
## Part 2: Training
To train a model and search the corpus with a given query, run the command:
```shell
python3 search-data.py [method] "[query]"
```
where `[method]` is one of `{tfidf,freq,lsi,doc2vec}` or `all` to run all classifiers, and `[query]` is the natural
language query to search for. Results are printed to stdout; for `doc2vec`, the trained model is saved to
`./doc2vec_model.dat` and loaded from that path on subsequent executions.
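For example, to search the corpus with the LSI model using the query discussed in the report:
```shell
python3 search-data.py lsi "Gather gpu device info"
```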
## Part 3: Evaluation
To evaluate a model, run the command:
```shell
python3 prec-recall.py [method] ./ground-truth-unique.txt
```
where `[method]` is one of `{tfidf,freq,lsi,doc2vec}` or `all` to evaluate all classifiers. The script outputs the
performance of the classifiers in terms of average precision and recall, namely:
| Engine | Average Precision | Average Recall |
|:---------|:--------------------|:-----------------|
| tfidf | 90.00% | 90.00% |
| freq | 93.33% | 100.00% |
| lsi | 90.00% | 90.00% |
| doc2vec | 73.33% | 80.00% |
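For example, to evaluate only the Doc2Vec engine (the script also writes the result to `out/doc2vec_prec_recall.txt`
and the corresponding T-SNE plot to `out/doc2vec_plot.png`):
```shell
python3 prec-recall.py doc2vec ./ground-truth-unique.txt
```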
## Report
To compile the report run:
```shell
cd report
pdflatex -interaction=nonstopmode -output-directory=. main.tex
pdflatex -interaction=nonstopmode -output-directory=. main.tex
```

BIN
doc2vec_model.dat Normal file

Binary file not shown.

extract-data.py

@@ -15,7 +15,7 @@ def find_py_files(dir):
def keep_name(name):
return not name.startswith("_") and not "main" in str(name).lower() and \
return not name.startswith("_") and "main" not in str(name).lower() and \
"test" not in str(name).lower()
@@ -28,7 +28,7 @@ class FeatureVisitor(ast.NodeVisitor):
def visit_FunctionDef(self, node):
if keep_name(node.name):
self.rows.append({
"name": node.name,
"name": node.name,
"file": self.filename,
"line": node.lineno,
"type": "function",
@@ -56,14 +56,14 @@ class FeatureVisitor(ast.NodeVisitor):
})
def main():
df = pd.DataFrame(columns=["name", "file", "line", "type", "comment"])
for file in find_py_files(IN_DIR):
files = list(find_py_files(IN_DIR))
for file in files:
with open(file, "r") as f:
py_source = f.read()
py_ast = ast.parse(py_source)
visitor = FeatureVisitor(file)
@@ -71,6 +71,16 @@ def main():
df_visitor = pd.DataFrame.from_records(visitor.rows)
df = pd.concat([df, df_visitor])
counts = df["type"].apply(lambda ft: {
"function": "Functions",
"class": "Classes",
"method": "Methods"
}[ft]).value_counts().to_dict()
counts["Python Files"] = len(files)
for file_type, name in counts.items():
print(f"{file_type}: {name}")
df.reset_index(drop=True).to_csv(OUT_FILE)

BIN
out/doc2vec_plot.png Normal file

Binary file not shown.


2
out/doc2vec_prec_recall.txt Normal file

@@ -0,0 +1,2 @@
Precision: 73.33%
Recall: 80.00%

2
out/freq_prec_recall.txt Normal file

@@ -0,0 +1,2 @@
Precision: 93.33%
Recall: 100.00%

BIN
out/lsi_plot.png Normal file

Binary file not shown.


2
out/lsi_prec_recall.txt Normal file

@@ -0,0 +1,2 @@
Precision: 90.00%
Recall: 90.00%

2
out/tfidf_prec_recall.txt Normal file

@@ -0,0 +1,2 @@
Precision: 90.00%
Recall: 90.00%

149
prec-recall.py Normal file

@@ -0,0 +1,149 @@
import argparse
import os.path
from typing import Iterable, Optional
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE
search_data = __import__('search-data')
TENSORFLOW_PATH_PREFIX: str = "./"
OUT_DIR: str = os.path.join(os.path.dirname(__file__), "out")
def read_ground_truth(file_path: str, df: pd.DataFrame) -> Iterable[tuple[str, int]]:
records: list[list[str]] = []
with open(file_path) as f:
record_tmp = []
for line in f:
line = line.strip()
if line == '':
assert len(record_tmp) == 3
records.append(record_tmp)
record_tmp = []
else:
record_tmp.append(line)
if len(record_tmp) == 3:
records.append(record_tmp)
for query, name, file_name in records:
assert file_name.startswith(TENSORFLOW_PATH_PREFIX)
file_name = file_name[len(TENSORFLOW_PATH_PREFIX):]
row = df[(df.name == name) & (df.file == file_name)]
assert len(row) == 1
yield query, row.index[0]
def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
for i, le in enumerate(li):
if le[0] == e:
return i
return None
def plot_df(results, query: str) -> Optional[pd.DataFrame]:
if results.vectors is not None and results.query_vector is not None:
tsne_vectors = np.array(results.vectors + [results.query_vector])
tsne = TSNE(n_components=2, perplexity=2, n_iter=3000)
tsne_results = tsne.fit_transform(tsne_vectors)
df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'Query', 'Vector kind'])
df['tsne-2d-one'] = tsne_results[:, 0]
df['tsne-2d-two'] = tsne_results[:, 1]
df['Query'] = [query] * (len(results.vectors) + 1)
df['Vector kind'] = (['Result'] * len(results.vectors)) + ['Input query']
return df
else:
return None
def evaluate(method_name: str, file_path: str) -> tuple[float, float]:
df = search_data.load_data()
test_set = list(read_ground_truth(file_path, df))
precision_sum = 0
recall_sum = 0
dfs = []
for query, expected in tqdm.tqdm(test_set):
search_results = search_data.search(query, method_name, df)
df_q = plot_df(search_results, query)
if df_q is not None:
dfs.append(df_q)
idx = better_index(search_results.indexes_scores, expected)
if idx is None:
precision = 0
recall = 0
else:
precision = 1 / (idx + 1)
recall = 1
precision_sum += precision
recall_sum += recall
if not os.path.isdir(OUT_DIR):
os.makedirs(OUT_DIR)
precision = precision_sum * 100 / len(test_set)
recall = recall_sum * 100 / len(test_set)
output = "Precision: {0:.2f}%\nRecall: {1:.2f}%\n".format(precision, recall)
print(output)
with open(os.path.join(OUT_DIR, "{0}_prec_recall.txt".format(method_name)), "w") as f:
f.write(output)
if len(dfs) > 0:
df = pd.concat(dfs)
plt.figure(figsize=(12, 10))
sns.scatterplot(
x="tsne-2d-one", y="tsne-2d-two",
hue="Query",
style="Vector kind",
palette=sns.color_palette("husl", n_colors=10),
data=df,
legend="full",
alpha=1.0
)
plt.savefig(os.path.join(OUT_DIR, "{0}_plot.png".format(method_name)))
return precision, recall
def main():
methods = ["tfidf", "freq", "lsi", "doc2vec"]
parser = argparse.ArgumentParser()
parser.add_argument("method", help="the method to compare similarities with", type=str, choices=methods + ["all"])
parser.add_argument("ground_truth_file", help="file where ground truth comes from", type=str)
args = parser.parse_args()
if args.method == "all":
df = pd.DataFrame(columns=["Engine", "Average Precision", "Average Recall"])
for i, method in enumerate(methods):
print(f"Applying method {method}:")
precision, recall = evaluate(method, args.ground_truth_file)
df.loc[i, "Engine"] = method
df.loc[i, "Average Precision"] = f"{precision:.2f}%"
df.loc[i, "Average Recall"] = f"{recall:.2f}%"
print(df.to_markdown(index=False))
else:
evaluate(args.method, args.ground_truth_file)
if __name__ == '__main__':
main()

BIN
report/main.pdf Normal file

Binary file not shown.

297
report/main.tex Normal file

@@ -0,0 +1,297 @@
%!TEX TS-program = pdflatexmk
\documentclass{article}
\usepackage{algorithm}
\usepackage{textcomp}
\usepackage{xcolor}
\usepackage{soul}
\usepackage{booktabs}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{microtype}
\usepackage{rotating}
\usepackage{graphicx}
\usepackage{paralist}
\usepackage{tabularx}
\usepackage{multicol}
\usepackage{multirow}
\usepackage{pbox}
\usepackage{enumitem}
\usepackage{colortbl}
\usepackage{pifont}
\usepackage{xspace}
\usepackage{url}
\usepackage{tikz}
\usepackage{fontawesome}
\usepackage{lscape}
\usepackage{listings}
\usepackage{color}
\usepackage{anyfontsize}
\usepackage{comment}
\usepackage{multibib}
\usepackage{float}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{changepage}
\usepackage{hyperref}
\title{Knowledge Management and Analysis \\ Project 01: Code Search}
\author{Claudio Maggioni}
\date{}
\begin{document}
\maketitle
\begin{adjustwidth}{-4cm}{-4cm}
\centering
\begin{tabular}{cc}
\toprule
Repository URL & \url{https://github.com/kamclassroom2022/project-01-multi-search-maggicl} \\
Commit ID & \texttt{b8e0a2c3c41249e45b233b55607e0b04ebe1bad0} \\ \bottomrule
\end{tabular}
\end{adjustwidth}
\vspace{1cm}
\subsection*{Section 1 - Data Extraction}
The data extraction process (implemented in the script \texttt{extract-data.py}) scans the files in the
TensorFlow project and extracts Python docstrings and symbol names for functions, classes and methods. A summary of the
number of features extracted can be found in table~\ref{tab:count1}. The collected figures show that the number of
classes is roughly two thirds of the number of files, while the number of functions is roughly 1.6 times the number of
files. Additionally, the data shows that a class has about 3 methods on average.
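As a simplified, illustrative sketch (the actual \texttt{extract-data.py} also visits classes and methods and filters
out private and test-related names), the extraction relies on an \texttt{ast.NodeVisitor}; here \texttt{example.py} is
only a stand-in for a scanned source file:
\begin{verbatim}
import ast

class FeatureVisitor(ast.NodeVisitor):
    def visit_FunctionDef(self, node):
        # Record the identifier, its location and its docstring (if any).
        print(node.name, node.lineno, ast.get_docstring(node))

with open("example.py") as f:  # stand-in path, not part of the project
    FeatureVisitor().visit(ast.parse(f.read()))
\end{verbatim}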
\begin{table}[H]
\centering
\begin{tabular}{cc}
\toprule
Type & Number \\
\midrule
Python files & 2817 \\
Classes & 1882 \\
Functions & 4565 \\
Methods & 5817 \\
\bottomrule
\end{tabular}
\caption{Count of extracted Python files, classes, functions and methods.}
\label{tab:count1}
\end{table}
\subsection*{Section 2: Training of search engines}
The training and model execution of the search engines is implemented in the Python script \texttt{search-data.py}.
The script loads the data extracted by \texttt{extract-data.py} and uses as features the
identifier name and only the first line of the docstring comment. All other comment lines are filtered out, as this
significantly improves performance when evaluating the models.
The script can search a given natural language query in the extracted TensorFlow corpus using four techniques,
namely: Word Frequency Similarity, Term-Frequency Inverse Document-Frequency (TF-IDF) Similarity, Latent
Semantic Indexing (LSI), and Doc2Vec.
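As an illustrative sketch of how the frequency and TF-IDF engines are built (simplified from
\texttt{search-data.py}, using a toy two-document corpus in place of the extracted data):
\begin{verbatim}
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity

# One token list per code entity: identifier words plus the first
# docstring line, lower-cased and with stop words removed.
corpus = [["gather", "gpu", "devices", "gather", "gpu", "device", "info"],
          ["gather", "memory", "info", "gather", "memory", "info"]]
dictionary = Dictionary(corpus)
bows = [dictionary.doc2bow(doc) for doc in corpus]

tfidf = TfidfModel(bows)   # the word frequency engine uses `bows` directly
index = SparseMatrixSimilarity(tfidf[bows], num_features=len(dictionary))
query = dictionary.doc2bow(["gather", "gpu", "device", "info"])
print(index[tfidf[query]]) # cosine similarity of the query to each document
\end{verbatim}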
Example outputs for the query ``Gather gpu device info'' using the word frequency, TF-IDF, LSI
and Doc2Vec models are shown in
figures~\ref{fig:search-freq},~\ref{fig:search-tfidf},~\ref{fig:search-lsi}~and~\ref{fig:search-doc2vec} respectively.
All four models correctly report the ground truth entry required by the file \texttt{ground-truth-unique.txt} as
the first result with $>90\%$ similarity, with the exception of the Doc2Vec model, which reports $71.63\%$ similarity.
\begin{figure}[b]
\small
\begin{verbatim}
Similarity: 90.45%
Python function: gather_gpu_devices
Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
File: tensorflow/tensorflow/tools/test/gpu_info_lib.py
Line: 167
Similarity: 57.74%
Python function: gather_memory_info
Description: Gather memory info.
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 70
Similarity: 57.74%
Python function: gather_platform_info
Description: Gather platform info.
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 146
Similarity: 55.47%
Python function: compute_capability_from_device_desc
Description: Returns the GpuInfo given a DeviceAttributes proto. Args: devi...
File: tensorflow/tensorflow/python/framework/gpu_util.py
Line: 35
Similarity: 55.47%
Python function: gather_available_device_info
Description: Gather list of devices available to TensorFlow. Returns: A lis...
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 126
\end{verbatim}
\caption{Search result output for the query ``Gather gpu device info'' using the word frequency similarity model.}
\label{fig:search-freq}
\end{figure}
\begin{figure}[b]
\small
\begin{verbatim}
Similarity: 90.95%
Python function: gather_gpu_devices
Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
File: tensorflow/tensorflow/tools/test/gpu_info_lib.py
Line: 167
Similarity: 59.12%
Python function: gather_memory_info
Description: Gather memory info.
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 70
Similarity: 56.40%
Python function: gather_available_device_info
Description: Gather list of devices available to TensorFlow. Returns: A lis...
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 126
Similarity: 55.25%
Python function: gather_platform_info
Description: Gather platform info.
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 146
Similarity: 53.97%
Python function: info
File: tensorflow/tensorflow/python/platform/tf_logging.py
Line: 167
\end{verbatim}
\caption{Search result output for the query ``Gather gpu device info'' using the TF-IDF model.}
\label{fig:search-tfidf}
\end{figure}
\begin{figure}[b]
\small
\begin{verbatim}
Similarity: 98.38%
Python function: gather_gpu_devices
Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
File: tensorflow/tensorflow/tools/test/gpu_info_lib.py
Line: 167
Similarity: 97.66%
Python function: device
Description: Uses gpu when requested and available.
File: tensorflow/tensorflow/python/framework/test_util.py
Line: 1581
Similarity: 97.66%
Python function: device
Description: Uses gpu when requested and available.
File: tensorflow/tensorflow/python/keras/testing_utils.py
Line: 925
Similarity: 96.79%
Python class: ParallelDevice
Description: A device which executes operations in parallel.
File: tensorflow/tensorflow/python/distribute/parallel_device/parallel_device.py
Line: 42
Similarity: 96.67%
Python method: get_var_on_device
File: tensorflow/tensorflow/python/distribute/packed_distributed_variable.py
Line: 90
\end{verbatim}
\caption{Search result output for the query ``Gather gpu device info'' using the LSI model.}
\label{fig:search-lsi}
\end{figure}
\begin{figure}[b]
\small
\begin{verbatim}
Similarity: 71.63%
Python function: gather_gpu_devices
Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
File: tensorflow/tensorflow/tools/test/gpu_info_lib.py
Line: 167
Similarity: 66.71%
Python function: device
Description: Uses gpu when requested and available.
File: tensorflow/tensorflow/python/keras/testing_utils.py
Line: 925
Similarity: 65.23%
Python function: gpu_device_name
Description: Returns the name of a GPU device if available or the empty str...
File: tensorflow/tensorflow/python/framework/test_util.py
Line: 129
Similarity: 64.33%
Python function: gather_available_device_info
Description: Gather list of devices available to TensorFlow. Returns: A lis...
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 126
Similarity: 64.29%
Python method: hosts
Description: A list of device names for CPU hosts. Returns: A list of devic...
File: tensorflow/tensorflow/python/tpu/tpu_embedding.py
Line: 1011
\end{verbatim}
\caption{Search result output for the query ``Gather gpu device info'' using the Doc2Vec model.}
\label{fig:search-doc2vec}
\end{figure}
\subsection*{Section 3: Evaluation of search engines}
The evaluation over the given ground truth to compute precision, recall, and the T-SNE plots is performed by the script
\texttt{prec-recall.py}. The calculated average precision and recall values are reported in table~\ref{tab:tab2}.
Precision and recall are quite high for all models.
The word frequency model has the highest precision and recall ($93.33\%$ and $100.00\%$ respectively), while the Doc2Vec
model has the lowest precision ($73.33\%$) and lowest recall ($80.00\%$).
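In essence, \texttt{prec-recall.py} computes per-query precision as the reciprocal rank of the ground-truth item among
the top five results (zero when it is absent) and per-query recall as $1$ when the item is retrieved and $0$ otherwise,
averaging both over the set $Q$ of ground-truth queries:
\[
\text{Precision} = \frac{100\%}{|Q|} \sum_{q \in Q} \frac{1}{\mathrm{rank}_q}, \qquad
\text{Recall} = \frac{100\%}{|Q|} \, \bigl|\{\, q \in Q : \mathrm{rank}_q \le 5 \,\}\bigr|,
\]
where $\mathrm{rank}_q$ is the position of the ground-truth item for query $q$, and $1/\mathrm{rank}_q$ is taken to be
$0$ when the item does not appear among the top five results.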
\begin{table}[H]
\centering
\begin{tabular}{ccc}
\toprule
Engine & Avg.\ Precision & Avg.\ Recall \\
\midrule
Frequencies & 93.33\% & 100.00\% \\
TF-IDF & 90.00\% & 90.00\% \\
LSI & 90.00\% & 90.00\% \\
Doc2Vec & 73.33\% & 80.00\% \\
\bottomrule
\end{tabular}
\caption{Evaluation of search engines.}
\label{tab:tab2}
\end{table}
\subsection*{Section 4: Visualisation of query results}
The two-dimensional T-SNE plots (computed with perplexity $= 2$) for the LSI and Doc2Vec models are shown in
figures~\ref{fig:tsne-lsi}~and~\ref{fig:tsne-doc2vec} respectively.
The T-SNE plot for the LSI model clearly shows the presence of outliers among the search results. The Doc2Vec plot
shows fewer outliers and more distinct clusters for the results of each query and for the query vector itself. However,
even considering the good performance of both models, it is hard to identify in the plots distinct ``regions'' where
the results and their respective query are located.
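The plot coordinates are obtained by fitting a two-dimensional T-SNE embedding over the result vectors of each query
together with the query vector itself, essentially as done in \texttt{prec-recall.py} (the random matrix below is only
an illustrative stand-in for those vectors):
\begin{verbatim}
import numpy as np
from sklearn.manifold import TSNE

vectors = np.random.rand(60, 50)  # stand-in for top-5 result and query vectors
tsne = TSNE(n_components=2, perplexity=2, n_iter=3000)
coords = tsne.fit_transform(vectors)  # one (x, y) point per vector in the plot
\end{verbatim}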
\begin{figure}
\begin{center}
\includegraphics[width=\textwidth]{../out/lsi_plot}
\caption{T-SNE plot for the LSI model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
\label{fig:tsne-lsi}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
\includegraphics[width=\textwidth]{../out/doc2vec_plot}
\caption{T-SNE plot for the Doc2Vec model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
\label{fig:tsne-doc2vec}
\end{center}
\end{figure}
\end{document}

requirements.txt

@@ -1,3 +1,9 @@
nltk==3.8.1
pandas==2.1.1
coloredlogs==15.0.1
gensim==4.3.2
nltk==3.8.1
numpy==1.26.1
pandas==2.1.2
tqdm==4.66.1
scikit-learn==1.3.2
seaborn==0.13.0
tabulate==0.9.0

search-data.py

@@ -1,109 +1,216 @@
import re
import argparse
import logging
import os
import pandas as pd
import re
import typing
from collections import defaultdict
from dataclasses import dataclass
from typing import Optional
import coloredlogs
import nltk
import numpy as np
from nltk.corpus import stopwords
from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity
from gensim.models import TfidfModel, LsiModel, LdaModel
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import pandas as pd
from gensim.corpora import Dictionary
from collections import defaultdict
from gensim.models import TfidfModel, LsiModel
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.similarities import SparseMatrixSimilarity
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('stopwords', quiet=True)
SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat")
# using ntlk stop words and example words for now
# using nltk stop words and example words for now
STOP_WORDS = set(stopwords.words('english')) \
.union(['test', 'tests', 'main', 'this'])
.union(['test', 'tests', 'main', 'this', 'self', 'int', 'get', 'set', 'new', 'return', 'list'])
def find_all(regex, word):
def find_all(regex: str, word: str, lower=True) -> list[str]:
matches = re.finditer(regex, word)
return [m.group(0).lower() for m in matches]
return [m.group(0).lower() if lower else m.group(0) for m in matches]
# https://stackoverflow.com/a/29920015
def camel_case_split(word):
def camel_case_split(word: str) -> list[str]:
return find_all('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
def identifier_split(identifier):
def identifier_split(identifier: str) -> list[str]:
return [y for x in identifier.split("_") for y in camel_case_split(x)]
def comment_split(comment):
return find_all('[A-Za-z0-9]+', comment)
def comment_split(comment: Optional[float | str], is_comment=True) -> list[str]:
if (type(comment) == float and np.isnan(comment)) or comment is None:
return []
# Consider only first line of each comment. Increases performance significantly
if is_comment:
comment = str(comment).split("\n", maxsplit=2)[0]
# Camel case split within "words" found takes care of referenced type names in the docstring comment
return [s for word in find_all('[A-Za-z]+', comment, lower=False) for s in camel_case_split(word)]
def remove_stopwords(input_bow_list):
return [word for word in input_bow_list if word not in STOP_WORDS]
def remove_stopwords(input_bow_list: list[str]) -> list[str]:
return [word for word in input_bow_list if word not in STOP_WORDS and len(word) > 2]
def get_bow(data, split_f):
def get_bow(data: Optional[float | str], split_f) -> list[str]:
if data is None or (type(data) == float and np.isnan(data)):
return []
return remove_stopwords(split_f(data))
def print_sims(corpus, query, df, dictionary):
def pick_most_similar(corpus, query, dictionary) -> list[tuple[int, float]]:
index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
sims = index[query]
pick_top = 5
print_results(sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top])
return sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top]
def print_results(idxs_scores, df):
for idx, score in idxs_scores:
def print_results(indexes_scores: list[tuple[int, float]], df):
print("\n===== RESULTS: =====")
for idx, score in indexes_scores:
row = df.loc[idx]
print("Similarity: {s:2.02f}%".format(s=score*100))
print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n" \
.format(feat=row["type"], name=row["name"], file=row["file"], line=row["line"]))
comment = row["comment"]
if type(comment) != str:
desc = ""
else:
comment = re.sub(re.compile(r'[\s\n]+', re.MULTILINE), ' ', comment)
desc = "Description: {c}\n".format(c=comment)
desc = (desc[:75] + '...\n') if len(desc) > 75 else desc
print("\nSimilarity: {s:2.02f}%".format(s=score * 100))
print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}"
.format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"]))
def search(query, method):
df = pd.read_csv(IN_DATASET)
def train_doc2vec(corpus_list):
dvdocs = [TaggedDocument(text, [i]) for i, text in enumerate(corpus_list)]
model = Doc2Vec(vector_size=300, epochs=50, sample=0)
model.build_vocab(dvdocs)
model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs)
model.save(DOC2VEC_MODEL)
return model
def load_data(print_frequent=False) -> pd.DataFrame:
df = pd.read_csv(IN_DATASET, index_col=0)
df["name_bow"] = df["name"].apply(lambda n: get_bow(n, identifier_split))
df["comment_bow"] = df["comment"].apply(lambda c: get_bow(c, comment_split))
if print_frequent:
freq = defaultdict(int)
for bow in df["name_bow"].tolist():
for i in bow:
freq[i] += 1
for bow in df["comment_bow"].tolist():
for i in bow:
freq[i] += 1
for key, value in sorted(freq.items(), key=lambda k: k[1], reverse=True)[:100]:
print(f"{value}: {key}")
return df
SparseVector = list[tuple[int, float]]
DenseVector = np.array
def to_dense(vector: SparseVector) -> DenseVector:
dense = [0.0] * len(vector)
for idx, value in vector:
dense[idx] = value
return np.array(dense)
@dataclass
class SearchResults:
indexes_scores: list[tuple[int, float]]
vectors: Optional[list[DenseVector]]
query_vector: Optional[DenseVector]
def __init__(self, indexes_values: list[tuple[int, float]], vectors: Optional[list[DenseVector]],
query_vector: Optional[DenseVector]):
self.indexes_scores = indexes_values
self.vectors = vectors
self.query_vector = query_vector
def search(query: str, method: str, df: pd.DataFrame) -> SearchResults:
corpus_list = []
for idx, row in df.iterrows():
for _, row in df.iterrows():
document_words = row["name_bow"] + row["comment_bow"]
corpus_list.append(document_words)
dictionary = Dictionary(corpus_list)
corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
query_w = get_bow(query, comment_split)
query_bow = dictionary.doc2bow(query_w)
query_w = comment_split(query, is_comment=False)
dictionary = None
corpus_bow = None
query_bow = None
if method != "doc2vec":
dictionary = Dictionary(corpus_list)
corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
query_bow = dictionary.doc2bow(query_w)
if method == "tfidf":
tfidf = TfidfModel(corpus_bow)
print_sims(tfidf[corpus_bow], tfidf[query_bow], df, dictionary)
return SearchResults(pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary), None, None)
elif method == "freq":
print_sims(corpus_bow, query_bow, df, dictionary)
return SearchResults(pick_most_similar(corpus_bow, query_bow, dictionary), None, None)
elif method == "lsi":
lsi = LsiModel(corpus_bow)
print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
lsi = LsiModel(corpus_bow, num_topics=50)
corpus = typing.cast(list[SparseVector], lsi[corpus_bow])
results = pick_most_similar(corpus, lsi[query_bow], dictionary)
result_vectors: list[DenseVector] = [to_dense(corpus[idx]) for idx, _ in results]
return SearchResults(results, result_vectors, to_dense(lsi[query_bow]))
elif method == "doc2vec":
dvdocs = [TaggedDocument(bow, [i]) for i, bow in enumerate(corpus_bow)]
model = Doc2Vec(vector_size=50, min_count=2, epochs=100)
model.build_vocab(dvdocs)
model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs)
dvquery = model.infer_vector(query_w)
print_results(model.dv.most_similar([dvquery], topn=5), df)
if os.path.exists(DOC2VEC_MODEL):
model = Doc2Vec.load(DOC2VEC_MODEL)
else:
model = train_doc2vec(corpus_list)
dv_query = model.infer_vector(query_w)
results = model.dv.most_similar([dv_query], topn=5)
result_vectors = [model.infer_vector(corpus_list[idx]) for idx, _ in results]
return SearchResults(results, result_vectors, dv_query)
else:
raise Error("method unknown")
raise ValueError("method unknown")
def main():
methods = ["tfidf", "freq", "lsi", "doc2vec"]
parser = argparse.ArgumentParser()
parser.add_argument("method", help="the method to compare similarities with", type=str)
parser.add_argument("method", help="the method to compare similarities with", type=str,
choices=methods + ["all"])
parser.add_argument("query", help="the query to search the corpus with", type=str)
parser.add_argument("-v", "--verbose", help="enable verbose logging", action='store_true')
args = parser.parse_args()
search(args.query, args.method)
if args.verbose:
coloredlogs.install()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
df = load_data()
if args.method == "all":
for method in methods:
print(f"Applying method {method}:")
results = search(args.query, method, df)
print_results(results.indexes_scores, df)
print()
else:
results = search(args.query, args.method, df)
print_results(results.indexes_scores, df)
if __name__ == "__main__":