Compare commits

...

10 Commits

Author SHA1 Message Date
Claudio Maggioni 7253cf8590 Added commit id in report 2023-11-08 22:31:16 +01:00
Claudio Maggioni b8e0a2c3c4 added name 2023-11-08 22:25:13 +01:00
Claudio Maggioni f374d2eeb5 Models fixed 2023-11-08 22:11:43 +01:00
Claudio Maggioni a288957112 report done 2023-11-07 15:07:15 +01:00
Claudio Maggioni ccda6c1c09 Report section 1 and 2 done 2023-11-07 12:35:27 +01:00
Claudio Maggioni dab12ddca7 wip report 2023-11-07 11:48:00 +01:00
Claudio Maggioni 9d8dd05428 done part 4 2023-10-25 15:45:04 +02:00
Claudio Maggioni 297f20d85e almost done part 4 2023-10-25 15:10:47 +02:00
Claudio Maggioni c644888371 part 3 done 2023-10-23 15:42:25 +02:00
Claudio Maggioni 8de7663a8a doc2vec executes 2023-10-16 16:36:25 +02:00
15 changed files with 1164 additions and 58 deletions

458
.gitignore vendored

@@ -1 +1,459 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
**/.DS_Store
out/model/*.pt
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb
## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf
## Generated if empty string is given at "Please type another file name for output:"
## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml
## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync
## Build tool directories for auxiliary files
# latexrun
latex.out/
## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa
# achemso
acs-*.bib
# amsthm
*.thm
# beamer
*.nav
*.pre
*.snm
*.vrb
# changes
*.soc
# comment
*.cut
# cprotect
*.cpt
# elsarticle (documentclass of Elsevier journals)
*.spl
# endnotes
*.ent
*.lox
# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm
#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R
# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist
# gnuplot
*.gnuplot
*.table
# gnuplottex
*-gnuplottex-*
# gregoriotex
*.gaux
*.glog
*.gtex
# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref
# hyperref
*.brf
# knitr
*-concordance.tex
# *.tikz
*-tikzDictionary
# listings
*.lol
# luatexja-ruby
*.ltjruby
# makeidx
*.idx
*.ilg
*.ind
# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*
# minted
_minted*
*.pyg
# morewrites
*.mw
# newpax
*.newpax
# nomencl
*.nlg
*.nlo
*.nls
# pax
*.pax
# pdfpcnotes
*.pdfpc
# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd
# scrwfile
*.wrt
# svg
svg-inkscape/
# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/
# pdfcomment
*.upa
*.upb
# pythontex
*.pytxcode
pythontex-files-*/
# tcolorbox
*.listing
# thmtools
*.loe
# TikZ & PGF
*.dpth
*.md5
*.auxlock
# titletoc
*.ptc
# todonotes
*.tdo
# vhistory
*.hst
*.ver
*.lod
# xcolor
*.xcp
# xmpincl
*.xmpi
# xindy
*.xdy
# xypic precompiled matrices and outlines
*.xyc
*.xyd
# endfloat
*.ttt
*.fff
# Latexian
TSWLatexianTemp*
## Editors:
# WinEdt
*.bak
*.sav
# Texpad
.texpadtmp
# LyX
*.lyx~
# Kile
*.backup
# gummi
.*.swp
# KBibTeX
*~[0-9]*
# TeXnicCenter
*.tps
# auto folder when using emacs and auctex
./auto/*
*.el
# expex forward references with \gathertags
*-tags.tex
# standalone packages
*.sta
# Makeindex log files
*.lpz
# xwatermark package
*.xwm
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

README.md

@@ -1,14 +1,85 @@
# Project 02: Multi-source code search
**Claudio Maggioni**
### About the Project
This project has the goal of developing a search engine able to query a large Python code repository using multiple sources of information.
This project has the goal of developing a search engine able to query a large Python code repository using multiple
sources of information.
It is part of the Knowledge Analysis & Management - 2022 course from the Università della Svizzera italiana.
In this repository, you can find the following files:
- tensor flow: a code repository to be used during this project
- ground-truth-unique: a file containing the reference triples necessary to evaluate the search engine (step 3)
- ground-truth-unique: a file containing the reference triples necessary to evaluate the search engine (step 3)
For more information, see the Project-02 slides (available on iCourse)
Note: Feel free to modify this file according to the project's necessities.
## Environment setup
To install the required dependencies make sure `python3` points to a Python 3.10 or 3.11 installation and then run:
```shell
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt
```
## Part 1: data extraction
To extract the data in file `data.csv` run the command:
```shell
python3 extract-data.py
```
The script prints the requested counts, namely:
```
Methods: 5817
Functions: 4565
Classes: 1882
Python Files: 2817
```
## Part 2: Training
To train a model and search the corpus with a given query, run the command:
```shell
python3 search-data.py [method] "[query]"
```
where `[method]` is one of `{tfidf,freq,lsi,doc2vec}` or `all` to run all classifiers, and `[query]` is the natural
language query to search for. Results are printed to stdout; for `doc2vec`, the trained model is saved to
`./doc2vec_model.dat` and loaded from that path on subsequent executions.
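For example, to search the corpus with the LSI model using the query discussed in the report:
```shell
python3 search-data.py lsi "Gather gpu device info"
```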
## Part 3: Evaluation
To evaluate a model, run the command:
```shell
python3 prec-recall.py [method] ./ground-truth-unique.txt
```
where `[method]` is one of `{tfidf,freq,lsi,doc2vec}` or `all` to evaluate all classifiers. The script outputs the
performance of the classifiers in terms of average precision and recall, namely:
| Engine | Average Precision | Average Recall |
|:---------|:--------------------|:-----------------|
| tfidf | 90.00% | 90.00% |
| freq | 93.33% | 100.00% |
| lsi | 90.00% | 90.00% |
| doc2vec | 73.33% | 80.00% |
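For example, to evaluate only the Doc2Vec engine (the script also writes the result to `out/doc2vec_prec_recall.txt`
and the corresponding T-SNE plot to `out/doc2vec_plot.png`):
```shell
python3 prec-recall.py doc2vec ./ground-truth-unique.txt
```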
## Report
To compile the report run:
```shell
cd report
pdflatex -interaction=nonstopmode -output-directory=. main.tex
pdflatex -interaction=nonstopmode -output-directory=. main.tex
```

BIN
doc2vec_model.dat Normal file

Binary file not shown.

extract-data.py

@@ -15,7 +15,7 @@ def find_py_files(dir):
def keep_name(name):
return not name.startswith("_") and not "main" in str(name).lower() and \
return not name.startswith("_") and "main" not in str(name).lower() and \
"test" not in str(name).lower()
@@ -28,7 +28,7 @@ class FeatureVisitor(ast.NodeVisitor):
def visit_FunctionDef(self, node):
if keep_name(node.name):
self.rows.append({
"name": node.name,
"name": node.name,
"file": self.filename,
"line": node.lineno,
"type": "function",
@@ -56,14 +56,14 @@ class FeatureVisitor(ast.NodeVisitor):
})
def main():
df = pd.DataFrame(columns=["name", "file", "line", "type", "comment"])
for file in find_py_files(IN_DIR):
files = list(find_py_files(IN_DIR))
for file in files:
with open(file, "r") as f:
py_source = f.read()
py_ast = ast.parse(py_source)
visitor = FeatureVisitor(file)
@@ -71,6 +71,16 @@ def main():
df_visitor = pd.DataFrame.from_records(visitor.rows)
df = pd.concat([df, df_visitor])
counts = df["type"].apply(lambda ft: {
"function": "Functions",
"class": "Classes",
"method": "Methods"
}[ft]).value_counts().to_dict()
counts["Python Files"] = len(files)
for file_type, name in counts.items():
print(f"{file_type}: {name}")
df.reset_index(drop=True).to_csv(OUT_FILE)

BIN
out/doc2vec_plot.png Normal file

Binary file not shown.


2
out/doc2vec_prec_recall.txt Normal file

@@ -0,0 +1,2 @@
Precision: 73.33%
Recall: 80.00%

2
out/freq_prec_recall.txt Normal file

@@ -0,0 +1,2 @@
Precision: 93.33%
Recall: 100.00%

BIN
out/lsi_plot.png Normal file

Binary file not shown.


2
out/lsi_prec_recall.txt Normal file

@@ -0,0 +1,2 @@
Precision: 90.00%
Recall: 90.00%

2
out/tfidf_prec_recall.txt Normal file

@@ -0,0 +1,2 @@
Precision: 90.00%
Recall: 90.00%

149
prec-recall.py Normal file

@@ -0,0 +1,149 @@
import argparse
import os.path
from typing import Iterable, Optional
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE
search_data = __import__('search-data')
TENSORFLOW_PATH_PREFIX: str = "./"
OUT_DIR: str = os.path.join(os.path.dirname(__file__), "out")
def read_ground_truth(file_path: str, df: pd.DataFrame) -> Iterable[tuple[str, int]]:
records: list[list[str]] = []
with open(file_path) as f:
record_tmp = []
for line in f:
line = line.strip()
if line == '':
assert len(record_tmp) == 3
records.append(record_tmp)
record_tmp = []
else:
record_tmp.append(line)
if len(record_tmp) == 3:
records.append(record_tmp)
for query, name, file_name in records:
assert file_name.startswith(TENSORFLOW_PATH_PREFIX)
file_name = file_name[len(TENSORFLOW_PATH_PREFIX):]
row = df[(df.name == name) & (df.file == file_name)]
assert len(row) == 1
yield query, row.index[0]
def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
for i, le in enumerate(li):
if le[0] == e:
return i
return None
def plot_df(results, query: str) -> Optional[pd.DataFrame]:
if results.vectors is not None and results.query_vector is not None:
tsne_vectors = np.array(results.vectors + [results.query_vector])
tsne = TSNE(n_components=2, perplexity=2, n_iter=3000)
tsne_results = tsne.fit_transform(tsne_vectors)
df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'Query', 'Vector kind'])
df['tsne-2d-one'] = tsne_results[:, 0]
df['tsne-2d-two'] = tsne_results[:, 1]
df['Query'] = [query] * (len(results.vectors) + 1)
df['Vector kind'] = (['Result'] * len(results.vectors)) + ['Input query']
return df
else:
return None
def evaluate(method_name: str, file_path: str) -> tuple[float, float]:
df = search_data.load_data()
test_set = list(read_ground_truth(file_path, df))
precision_sum = 0
recall_sum = 0
dfs = []
for query, expected in tqdm.tqdm(test_set):
search_results = search_data.search(query, method_name, df)
df_q = plot_df(search_results, query)
if df_q is not None:
dfs.append(df_q)
idx = better_index(search_results.indexes_scores, expected)
if idx is None:
precision = 0
recall = 0
else:
precision = 1 / (idx + 1)
recall = 1
precision_sum += precision
recall_sum += recall
if not os.path.isdir(OUT_DIR):
os.makedirs(OUT_DIR)
precision = precision_sum * 100 / len(test_set)
recall = recall_sum * 100 / len(test_set)
output = "Precision: {0:.2f}%\nRecall: {1:.2f}%\n".format(precision, recall)
print(output)
with open(os.path.join(OUT_DIR, "{0}_prec_recall.txt".format(method_name)), "w") as f:
f.write(output)
if len(dfs) > 0:
df = pd.concat(dfs)
plt.figure(figsize=(12, 10))
sns.scatterplot(
x="tsne-2d-one", y="tsne-2d-two",
hue="Query",
style="Vector kind",
palette=sns.color_palette("husl", n_colors=10),
data=df,
legend="full",
alpha=1.0
)
plt.savefig(os.path.join(OUT_DIR, "{0}_plot.png".format(method_name)))
return precision, recall
def main():
methods = ["tfidf", "freq", "lsi", "doc2vec"]
parser = argparse.ArgumentParser()
parser.add_argument("method", help="the method to compare similarities with", type=str, choices=methods + ["all"])
parser.add_argument("ground_truth_file", help="file where ground truth comes from", type=str)
args = parser.parse_args()
if args.method == "all":
df = pd.DataFrame(columns=["Engine", "Average Precision", "Average Recall"])
for i, method in enumerate(methods):
print(f"Applying method {method}:")
precision, recall = evaluate(method, args.ground_truth_file)
df.loc[i, "Engine"] = method
df.loc[i, "Average Precision"] = f"{precision:.2f}%"
df.loc[i, "Average Recall"] = f"{recall:.2f}%"
print(df.to_markdown(index=False))
else:
evaluate(args.method, args.ground_truth_file)
if __name__ == '__main__':
main()

BIN
report/main.pdf Normal file

Binary file not shown.

297
report/main.tex Normal file

@@ -0,0 +1,297 @@
%!TEX TS-program = pdflatexmk
\documentclass{article}
\usepackage{algorithm}
\usepackage{textcomp}
\usepackage{xcolor}
\usepackage{soul}
\usepackage{booktabs}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{microtype}
\usepackage{rotating}
\usepackage{graphicx}
\usepackage{paralist}
\usepackage{tabularx}
\usepackage{multicol}
\usepackage{multirow}
\usepackage{pbox}
\usepackage{enumitem}
\usepackage{colortbl}
\usepackage{pifont}
\usepackage{xspace}
\usepackage{url}
\usepackage{tikz}
\usepackage{fontawesome}
\usepackage{lscape}
\usepackage{listings}
\usepackage{color}
\usepackage{anyfontsize}
\usepackage{comment}
\usepackage{multibib}
\usepackage{float}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{changepage}
\usepackage{hyperref}
\title{Knowledge Management and Analysis \\ Project 01: Code Search}
\author{Claudio Maggioni}
\date{}
\begin{document}
\maketitle
\begin{adjustwidth}{-4cm}{-4cm}
\centering
\begin{tabular}{cc}
\toprule
Repository URL & \url{https://github.com/kamclassroom2022/project-01-multi-search-maggicl} \\
Commit ID & \texttt{b8e0a2c3c41249e45b233b55607e0b04ebe1bad0} \\ \bottomrule
\end{tabular}
\end{adjustwidth}
\vspace{1cm}
\subsection*{Section 1 - Data Extraction}
The data extraction process (implemented in the script \texttt{extract-data.py}) scans the files in the
TensorFlow project and extracts Python docstrings and symbol names for functions, classes and methods. A summary of the
number of features extracted can be found in table~\ref{tab:count1}. The collected figures show that the number of
classes is roughly two thirds of the number of files, while the number of functions is roughly 1.6 times the number of
files. Additionally, the data shows that a class has about 3 methods on average.
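As a simplified, illustrative sketch (the actual \texttt{extract-data.py} also visits classes and methods and filters
out private and test-related names), the extraction relies on an \texttt{ast.NodeVisitor}; here \texttt{example.py} is
only a stand-in for a scanned source file:
\begin{verbatim}
import ast

class FeatureVisitor(ast.NodeVisitor):
    def visit_FunctionDef(self, node):
        # Record the identifier, its location and its docstring (if any).
        print(node.name, node.lineno, ast.get_docstring(node))

with open("example.py") as f:  # stand-in path, not part of the project
    FeatureVisitor().visit(ast.parse(f.read()))
\end{verbatim}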
\begin{table}[H]
\centering
\begin{tabular}{cc}
\toprule
Type & Number \\
\midrule
Python files & 2817 \\
Classes & 1882 \\
Functions & 4565 \\
Methods & 5817 \\
\bottomrule
\end{tabular}
\caption{Count of extracted Python files, classes, functions and methods.}
\label{tab:count1}
\end{table}
\subsection*{Section 2: Training of search engines}
The training and model execution of the search engines is implemented in the Python script \texttt{search-data.py}.
The script loads the data extracted by \texttt{extract-data.py} and uses as features the
identifier name and only the first line of the docstring comment. All other comment lines are filtered out, as this
significantly improves performance when evaluating the models.
The script can search a given natural language query in the extracted TensorFlow corpus using four techniques,
namely: Word Frequency Similarity, Term-Frequency Inverse Document-Frequency (TF-IDF) Similarity, Latent
Semantic Indexing (LSI), and Doc2Vec.
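As an illustrative sketch of how the frequency and TF-IDF engines are built (simplified from
\texttt{search-data.py}, using a toy two-document corpus in place of the extracted data):
\begin{verbatim}
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity

# One token list per code entity: identifier words plus the first
# docstring line, lower-cased and with stop words removed.
corpus = [["gather", "gpu", "devices", "gather", "gpu", "device", "info"],
          ["gather", "memory", "info", "gather", "memory", "info"]]
dictionary = Dictionary(corpus)
bows = [dictionary.doc2bow(doc) for doc in corpus]

tfidf = TfidfModel(bows)   # the word frequency engine uses `bows` directly
index = SparseMatrixSimilarity(tfidf[bows], num_features=len(dictionary))
query = dictionary.doc2bow(["gather", "gpu", "device", "info"])
print(index[tfidf[query]]) # cosine similarity of the query to each document
\end{verbatim}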
Example outputs for the query ``Gather gpu device info'' using the word frequency, TF-IDF, LSI
and Doc2Vec models are shown in
figures~\ref{fig:search-freq},~\ref{fig:search-tfidf},~\ref{fig:search-lsi}~and~\ref{fig:search-doc2vec} respectively.
All four models correctly report the ground truth entry required by the file \texttt{ground-truth-unique.txt} as
the first result with $>90\%$ similarity, with the exception of the Doc2Vec model, which reports $71.63\%$ similarity.
\begin{figure}[b]
\small
\begin{verbatim}
Similarity: 90.45%
Python function: gather_gpu_devices
Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
File: tensorflow/tensorflow/tools/test/gpu_info_lib.py
Line: 167
Similarity: 57.74%
Python function: gather_memory_info
Description: Gather memory info.
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 70
Similarity: 57.74%
Python function: gather_platform_info
Description: Gather platform info.
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 146
Similarity: 55.47%
Python function: compute_capability_from_device_desc
Description: Returns the GpuInfo given a DeviceAttributes proto. Args: devi...
File: tensorflow/tensorflow/python/framework/gpu_util.py
Line: 35
Similarity: 55.47%
Python function: gather_available_device_info
Description: Gather list of devices available to TensorFlow. Returns: A lis...
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 126
\end{verbatim}
\caption{Search result output for the query ``Gather gpu device info'' using the word frequency similarity model.}
\label{fig:search-freq}
\end{figure}
\begin{figure}[b]
\small
\begin{verbatim}
Similarity: 90.95%
Python function: gather_gpu_devices
Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
File: tensorflow/tensorflow/tools/test/gpu_info_lib.py
Line: 167
Similarity: 59.12%
Python function: gather_memory_info
Description: Gather memory info.
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 70
Similarity: 56.40%
Python function: gather_available_device_info
Description: Gather list of devices available to TensorFlow. Returns: A lis...
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 126
Similarity: 55.25%
Python function: gather_platform_info
Description: Gather platform info.
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 146
Similarity: 53.97%
Python function: info
File: tensorflow/tensorflow/python/platform/tf_logging.py
Line: 167
\end{verbatim}
\caption{Search result output for the query ``Gather gpu device info'' using the TF-IDF model.}
\label{fig:search-tfidf}
\end{figure}
\begin{figure}[b]
\small
\begin{verbatim}
Similarity: 98.38%
Python function: gather_gpu_devices
Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
File: tensorflow/tensorflow/tools/test/gpu_info_lib.py
Line: 167
Similarity: 97.66%
Python function: device
Description: Uses gpu when requested and available.
File: tensorflow/tensorflow/python/framework/test_util.py
Line: 1581
Similarity: 97.66%
Python function: device
Description: Uses gpu when requested and available.
File: tensorflow/tensorflow/python/keras/testing_utils.py
Line: 925
Similarity: 96.79%
Python class: ParallelDevice
Description: A device which executes operations in parallel.
File: tensorflow/tensorflow/python/distribute/parallel_device/parallel_device.py
Line: 42
Similarity: 96.67%
Python method: get_var_on_device
File: tensorflow/tensorflow/python/distribute/packed_distributed_variable.py
Line: 90
\end{verbatim}
\caption{Search result output for the query ``Gather gpu device info'' using the LSI model.}
\label{fig:search-lsi}
\end{figure}
\begin{figure}[b]
\small
\begin{verbatim}
Similarity: 71.63%
Python function: gather_gpu_devices
Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
File: tensorflow/tensorflow/tools/test/gpu_info_lib.py
Line: 167
Similarity: 66.71%
Python function: device
Description: Uses gpu when requested and available.
File: tensorflow/tensorflow/python/keras/testing_utils.py
Line: 925
Similarity: 65.23%
Python function: gpu_device_name
Description: Returns the name of a GPU device if available or the empty str...
File: tensorflow/tensorflow/python/framework/test_util.py
Line: 129
Similarity: 64.33%
Python function: gather_available_device_info
Description: Gather list of devices available to TensorFlow. Returns: A lis...
File: tensorflow/tensorflow/tools/test/system_info_lib.py
Line: 126
Similarity: 64.29%
Python method: hosts
Description: A list of device names for CPU hosts. Returns: A list of devic...
File: tensorflow/tensorflow/python/tpu/tpu_embedding.py
Line: 1011
\end{verbatim}
\caption{Search result output for the query ``Gather gpu device info'' using the Doc2Vec model.}
\label{fig:search-doc2vec}
\end{figure}
\subsection*{Section 3: Evaluation of search engines}
The evaluation over the given ground truth to compute precision, recall, and the T-SNE plots is performed by the script
\texttt{prec-recall.py}. The calculated average precision and recall values are reported in table~\ref{tab:tab2}.
Precision and recall are quite high for all models.
The word frequency model has the highest precision and recall ($93.33\%$ and $100.00\%$ respectively), while the Doc2Vec
model has the lowest precision ($73.33\%$) and lowest recall ($80.00\%$).
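In essence, \texttt{prec-recall.py} computes per-query precision as the reciprocal rank of the ground-truth item among
the top five results (zero when it is absent) and per-query recall as $1$ when the item is retrieved and $0$ otherwise,
averaging both over the set $Q$ of ground-truth queries:
\[
\text{Precision} = \frac{100\%}{|Q|} \sum_{q \in Q} \frac{1}{\mathrm{rank}_q}, \qquad
\text{Recall} = \frac{100\%}{|Q|} \, \bigl|\{\, q \in Q : \mathrm{rank}_q \le 5 \,\}\bigr|,
\]
where $\mathrm{rank}_q$ is the position of the ground-truth item for query $q$, and $1/\mathrm{rank}_q$ is taken to be
$0$ when the item does not appear among the top five results.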
\begin{table}[H]
\centering
\begin{tabular}{ccc}
\toprule
Engine & Avg.\ Precision & Avg.\ Recall \\
\midrule
Frequencies & 93.33\% & 100.00\% \\
TF-IDF & 90.00\% & 90.00\% \\
LSI & 90.00\% & 90.00\% \\
Doc2Vec & 73.33\% & 80.00\% \\
\bottomrule
\end{tabular}
\caption{Evaluation of search engines.}
\label{tab:tab2}
\end{table}
\subsection*{Section 4: Visualisation of query results}
The two-dimensional T-SNE plots (computed with perplexity $= 2$) for the LSI and Doc2Vec models are shown in
figures~\ref{fig:tsne-lsi}~and~\ref{fig:tsne-doc2vec} respectively.
The T-SNE plot for the LSI model clearly shows the presence of outliers among the search results. The Doc2Vec plot
shows fewer outliers and more distinct clusters for the results of each query and for the query vector itself. However,
even considering the good performance of both models, it is hard to identify in the plots distinct ``regions'' where
the results and their respective query are located.
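The plot coordinates are obtained by fitting a two-dimensional T-SNE embedding over the result vectors of each query
together with the query vector itself, essentially as done in \texttt{prec-recall.py} (the random matrix below is only
an illustrative stand-in for those vectors):
\begin{verbatim}
import numpy as np
from sklearn.manifold import TSNE

vectors = np.random.rand(60, 50)  # stand-in for top-5 result and query vectors
tsne = TSNE(n_components=2, perplexity=2, n_iter=3000)
coords = tsne.fit_transform(vectors)  # one (x, y) point per vector in the plot
\end{verbatim}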
\begin{figure}
\begin{center}
\includegraphics[width=\textwidth]{../out/lsi_plot}
\caption{T-SNE plot for the LSI model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
\label{fig:tsne-lsi}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
\includegraphics[width=\textwidth]{../out/doc2vec_plot}
\caption{T-SNE plot for the Doc2Vec model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
\label{fig:tsne-doc2vec}
\end{center}
\end{figure}
\end{document}

requirements.txt

@@ -1,3 +1,9 @@
nltk==3.8.1
pandas==2.1.1
coloredlogs==15.0.1
gensim==4.3.2
nltk==3.8.1
numpy==1.26.1
pandas==2.1.2
tqdm==4.66.1
scikit-learn==1.3.2
seaborn==0.13.0
tabulate==0.9.0

search-data.py

@@ -1,109 +1,216 @@
import re
import argparse
import logging
import os
import pandas as pd
import re
import typing
from collections import defaultdict
from dataclasses import dataclass
from typing import Optional
import coloredlogs
import nltk
import numpy as np
from nltk.corpus import stopwords
from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity
from gensim.models import TfidfModel, LsiModel, LdaModel
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import pandas as pd
from gensim.corpora import Dictionary
from collections import defaultdict
from gensim.models import TfidfModel, LsiModel
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.similarities import SparseMatrixSimilarity
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('stopwords', quiet=True)
SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat")
# using ntlk stop words and example words for now
# using nltk stop words and example words for now
STOP_WORDS = set(stopwords.words('english')) \
.union(['test', 'tests', 'main', 'this'])
.union(['test', 'tests', 'main', 'this', 'self', 'int', 'get', 'set', 'new', 'return', 'list'])
def find_all(regex, word):
def find_all(regex: str, word: str, lower=True) -> list[str]:
matches = re.finditer(regex, word)
return [m.group(0).lower() for m in matches]
return [m.group(0).lower() if lower else m.group(0) for m in matches]
# https://stackoverflow.com/a/29920015
def camel_case_split(word):
def camel_case_split(word: str) -> list[str]:
return find_all('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
def identifier_split(identifier):
def identifier_split(identifier: str) -> list[str]:
return [y for x in identifier.split("_") for y in camel_case_split(x)]
def comment_split(comment):
return find_all('[A-Za-z0-9]+', comment)
def comment_split(comment: Optional[float | str], is_comment=True) -> list[str]:
if (type(comment) == float and np.isnan(comment)) or comment is None:
return []
# Consider only first line of each comment. Increases performance significantly
if is_comment:
comment = str(comment).split("\n", maxsplit=2)[0]
# Camel case split within "words" found takes care of referenced type names in the docstring comment
return [s for word in find_all('[A-Za-z]+', comment, lower=False) for s in camel_case_split(word)]
def remove_stopwords(input_bow_list):
return [word for word in input_bow_list if word not in STOP_WORDS]
def remove_stopwords(input_bow_list: list[str]) -> list[str]:
return [word for word in input_bow_list if word not in STOP_WORDS and len(word) > 2]
def get_bow(data, split_f):
def get_bow(data: Optional[float | str], split_f) -> list[str]:
if data is None or (type(data) == float and np.isnan(data)):
return []
return remove_stopwords(split_f(data))
def print_sims(corpus, query, df, dictionary):
def pick_most_similar(corpus, query, dictionary) -> list[tuple[int, float]]:
index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
sims = index[query]
pick_top = 5
print_results(sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top])
return sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top]
def print_results(idxs_scores, df):
for idx, score in idxs_scores:
def print_results(indexes_scores: list[tuple[int, float]], df):
print("\n===== RESULTS: =====")
for idx, score in indexes_scores:
row = df.loc[idx]
print("Similarity: {s:2.02f}%".format(s=score*100))
print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n" \
.format(feat=row["type"], name=row["name"], file=row["file"], line=row["line"]))
comment = row["comment"]
if type(comment) != str:
desc = ""
else:
comment = re.sub(re.compile(r'[\s\n]+', re.MULTILINE), ' ', comment)
desc = "Description: {c}\n".format(c=comment)
desc = (desc[:75] + '...\n') if len(desc) > 75 else desc
print("\nSimilarity: {s:2.02f}%".format(s=score * 100))
print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}"
.format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"]))
def search(query, method):
df = pd.read_csv(IN_DATASET)
def train_doc2vec(corpus_list):
dvdocs = [TaggedDocument(text, [i]) for i, text in enumerate(corpus_list)]
model = Doc2Vec(vector_size=300, epochs=50, sample=0)
model.build_vocab(dvdocs)
model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs)
model.save(DOC2VEC_MODEL)
return model
def load_data(print_frequent=False) -> pd.DataFrame:
df = pd.read_csv(IN_DATASET, index_col=0)
df["name_bow"] = df["name"].apply(lambda n: get_bow(n, identifier_split))
df["comment_bow"] = df["comment"].apply(lambda c: get_bow(c, comment_split))
if print_frequent:
freq = defaultdict(int)
for bow in df["name_bow"].tolist():
for i in bow:
freq[i] += 1
for bow in df["comment_bow"].tolist():
for i in bow:
freq[i] += 1
for key, value in sorted(freq.items(), key=lambda k: k[1], reverse=True)[:100]:
print(f"{value}: {key}")
return df
SparseVector = list[tuple[int, float]]
DenseVector = np.array
def to_dense(vector: SparseVector) -> DenseVector:
dense = [0.0] * len(vector)
for idx, value in vector:
dense[idx] = value
return np.array(dense)
@dataclass
class SearchResults:
indexes_scores: list[tuple[int, float]]
vectors: Optional[list[DenseVector]]
query_vector: Optional[DenseVector]
def __init__(self, indexes_values: list[tuple[int, float]], vectors: Optional[list[DenseVector]],
query_vector: Optional[DenseVector]):
self.indexes_scores = indexes_values
self.vectors = vectors
self.query_vector = query_vector
def search(query: str, method: str, df: pd.DataFrame) -> SearchResults:
corpus_list = []
for idx, row in df.iterrows():
for _, row in df.iterrows():
document_words = row["name_bow"] + row["comment_bow"]
corpus_list.append(document_words)
dictionary = Dictionary(corpus_list)
corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
query_w = get_bow(query, comment_split)
query_bow = dictionary.doc2bow(query_w)
query_w = comment_split(query, is_comment=False)
dictionary = None
corpus_bow = None
query_bow = None
if method != "doc2vec":
dictionary = Dictionary(corpus_list)
corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
query_bow = dictionary.doc2bow(query_w)
if method == "tfidf":
tfidf = TfidfModel(corpus_bow)
print_sims(tfidf[corpus_bow], tfidf[query_bow], df, dictionary)
return SearchResults(pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary), None, None)
elif method == "freq":
print_sims(corpus_bow, query_bow, df, dictionary)
return SearchResults(pick_most_similar(corpus_bow, query_bow, dictionary), None, None)
elif method == "lsi":
lsi = LsiModel(corpus_bow)
print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
lsi = LsiModel(corpus_bow, num_topics=50)
corpus = typing.cast(list[SparseVector], lsi[corpus_bow])
results = pick_most_similar(corpus, lsi[query_bow], dictionary)
result_vectors: list[DenseVector] = [to_dense(corpus[idx]) for idx, _ in results]
return SearchResults(results, result_vectors, to_dense(lsi[query_bow]))
elif method == "doc2vec":
dvdocs = [TaggedDocument(bow, [i]) for i, bow in enumerate(corpus_bow)]
model = Doc2Vec(vector_size=50, min_count=2, epochs=100)
model.build_vocab(dvdocs)
model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs)
dvquery = model.infer_vector(query_w)
print_results(model.dv.most_similar([dvquery], topn=5), df)
if os.path.exists(DOC2VEC_MODEL):
model = Doc2Vec.load(DOC2VEC_MODEL)
else:
model = train_doc2vec(corpus_list)
dv_query = model.infer_vector(query_w)
results = model.dv.most_similar([dv_query], topn=5)
result_vectors = [model.infer_vector(corpus_list[idx]) for idx, _ in results]
return SearchResults(results, result_vectors, dv_query)
else:
raise Error("method unknown")
raise ValueError("method unknown")
def main():
methods = ["tfidf", "freq", "lsi", "doc2vec"]
parser = argparse.ArgumentParser()
parser.add_argument("method", help="the method to compare similarities with", type=str)
parser.add_argument("method", help="the method to compare similarities with", type=str,
choices=methods + ["all"])
parser.add_argument("query", help="the query to search the corpus with", type=str)
parser.add_argument("-v", "--verbose", help="enable verbose logging", action='store_true')
args = parser.parse_args()
search(args.query, args.method)
if args.verbose:
coloredlogs.install()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
df = load_data()
if args.method == "all":
for method in methods:
print(f"Applying method {method}:")
results = search(args.query, method, df)
print_results(results.indexes_scores, df)
print()
else:
results = search(args.query, args.method, df)
print_results(results.indexes_scores, df)
if __name__ == "__main__":