Compare commits
10 Commits
b244744c21
...
7253cf8590
Author | SHA1 | Date |
---|---|---|
Claudio Maggioni | 7253cf8590 | |
Claudio Maggioni | b8e0a2c3c4 | |
Claudio Maggioni | f374d2eeb5 | |
Claudio Maggioni | a288957112 | |
Claudio Maggioni | ccda6c1c09 | |
Claudio Maggioni | dab12ddca7 | |
Claudio Maggioni | 9d8dd05428 | |
Claudio Maggioni | 297f20d85e | |
Claudio Maggioni | c644888371 | |
Claudio Maggioni | 8de7663a8a |
|
@ -1 +1,459 @@
|
|||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
.idea/
|
||||
**/.DS_Store
|
||||
out/model/*.pt
|
||||
|
||||
## Core latex/pdflatex auxiliary files:
|
||||
*.aux
|
||||
*.lof
|
||||
*.lot
|
||||
*.fls
|
||||
*.out
|
||||
*.toc
|
||||
*.fmt
|
||||
*.fot
|
||||
*.cb
|
||||
*.cb2
|
||||
.*.lb
|
||||
|
||||
## Intermediate documents:
|
||||
*.dvi
|
||||
*.xdv
|
||||
*-converted-to.*
|
||||
# these rules might exclude image files for figures etc.
|
||||
# *.ps
|
||||
# *.eps
|
||||
# *.pdf
|
||||
|
||||
## Generated if empty string is given at "Please type another file name for output:"
|
||||
|
||||
## Bibliography auxiliary files (bibtex/biblatex/biber):
|
||||
*.bbl
|
||||
*.bcf
|
||||
*.blg
|
||||
*-blx.aux
|
||||
*-blx.bib
|
||||
*.run.xml
|
||||
|
||||
## Build tool auxiliary files:
|
||||
*.fdb_latexmk
|
||||
*.synctex
|
||||
*.synctex(busy)
|
||||
*.synctex.gz
|
||||
*.synctex.gz(busy)
|
||||
*.pdfsync
|
||||
|
||||
## Build tool directories for auxiliary files
|
||||
# latexrun
|
||||
latex.out/
|
||||
|
||||
## Auxiliary and intermediate files from other packages:
|
||||
# algorithms
|
||||
*.alg
|
||||
*.loa
|
||||
|
||||
# achemso
|
||||
acs-*.bib
|
||||
|
||||
# amsthm
|
||||
*.thm
|
||||
|
||||
# beamer
|
||||
*.nav
|
||||
*.pre
|
||||
*.snm
|
||||
*.vrb
|
||||
|
||||
# changes
|
||||
*.soc
|
||||
|
||||
# comment
|
||||
*.cut
|
||||
|
||||
# cprotect
|
||||
*.cpt
|
||||
|
||||
# elsarticle (documentclass of Elsevier journals)
|
||||
*.spl
|
||||
|
||||
# endnotes
|
||||
*.ent
|
||||
|
||||
*.lox
|
||||
|
||||
# feynmf/feynmp
|
||||
*.mf
|
||||
*.mp
|
||||
*.t[1-9]
|
||||
*.t[1-9][0-9]
|
||||
*.tfm
|
||||
|
||||
#(r)(e)ledmac/(r)(e)ledpar
|
||||
*.end
|
||||
*.?end
|
||||
*.[1-9]
|
||||
*.[1-9][0-9]
|
||||
*.[1-9][0-9][0-9]
|
||||
*.[1-9]R
|
||||
*.[1-9][0-9]R
|
||||
*.[1-9][0-9][0-9]R
|
||||
*.eledsec[1-9]
|
||||
*.eledsec[1-9]R
|
||||
*.eledsec[1-9][0-9]
|
||||
*.eledsec[1-9][0-9]R
|
||||
*.eledsec[1-9][0-9][0-9]
|
||||
*.eledsec[1-9][0-9][0-9]R
|
||||
|
||||
# glossaries
|
||||
*.acn
|
||||
*.acr
|
||||
*.glg
|
||||
*.glo
|
||||
*.gls
|
||||
*.glsdefs
|
||||
*.lzo
|
||||
*.lzs
|
||||
*.slg
|
||||
*.slo
|
||||
*.sls
|
||||
|
||||
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
|
||||
# *.ist
|
||||
|
||||
# gnuplot
|
||||
*.gnuplot
|
||||
*.table
|
||||
|
||||
# gnuplottex
|
||||
*-gnuplottex-*
|
||||
|
||||
# gregoriotex
|
||||
*.gaux
|
||||
*.glog
|
||||
*.gtex
|
||||
|
||||
# htlatex
|
||||
*.4ct
|
||||
*.4tc
|
||||
*.idv
|
||||
*.lg
|
||||
*.trc
|
||||
*.xref
|
||||
|
||||
# hyperref
|
||||
*.brf
|
||||
|
||||
# knitr
|
||||
*-concordance.tex
|
||||
# *.tikz
|
||||
*-tikzDictionary
|
||||
|
||||
# listings
|
||||
*.lol
|
||||
|
||||
# luatexja-ruby
|
||||
*.ltjruby
|
||||
|
||||
# makeidx
|
||||
*.idx
|
||||
*.ilg
|
||||
*.ind
|
||||
|
||||
# minitoc
|
||||
*.maf
|
||||
*.mlf
|
||||
*.mlt
|
||||
*.mtc[0-9]*
|
||||
*.slf[0-9]*
|
||||
*.slt[0-9]*
|
||||
*.stc[0-9]*
|
||||
|
||||
# minted
|
||||
_minted*
|
||||
*.pyg
|
||||
|
||||
# morewrites
|
||||
*.mw
|
||||
|
||||
# newpax
|
||||
*.newpax
|
||||
|
||||
# nomencl
|
||||
*.nlg
|
||||
*.nlo
|
||||
*.nls
|
||||
|
||||
# pax
|
||||
*.pax
|
||||
|
||||
# pdfpcnotes
|
||||
*.pdfpc
|
||||
|
||||
# sagetex
|
||||
*.sagetex.sage
|
||||
*.sagetex.py
|
||||
*.sagetex.scmd
|
||||
|
||||
# scrwfile
|
||||
*.wrt
|
||||
|
||||
# svg
|
||||
svg-inkscape/
|
||||
|
||||
# sympy
|
||||
*.sout
|
||||
*.sympy
|
||||
sympy-plots-for-*.tex/
|
||||
|
||||
# pdfcomment
|
||||
*.upa
|
||||
*.upb
|
||||
|
||||
# pythontex
|
||||
*.pytxcode
|
||||
pythontex-files-*/
|
||||
|
||||
# tcolorbox
|
||||
*.listing
|
||||
|
||||
# thmtools
|
||||
*.loe
|
||||
|
||||
# TikZ & PGF
|
||||
*.dpth
|
||||
*.md5
|
||||
*.auxlock
|
||||
|
||||
# titletoc
|
||||
*.ptc
|
||||
|
||||
# todonotes
|
||||
*.tdo
|
||||
|
||||
# vhistory
|
||||
*.hst
|
||||
*.ver
|
||||
|
||||
*.lod
|
||||
|
||||
# xcolor
|
||||
*.xcp
|
||||
|
||||
# xmpincl
|
||||
*.xmpi
|
||||
|
||||
# xindy
|
||||
*.xdy
|
||||
|
||||
# xypic precompiled matrices and outlines
|
||||
*.xyc
|
||||
*.xyd
|
||||
|
||||
# endfloat
|
||||
*.ttt
|
||||
*.fff
|
||||
|
||||
# Latexian
|
||||
TSWLatexianTemp*
|
||||
|
||||
## Editors:
|
||||
# WinEdt
|
||||
*.bak
|
||||
*.sav
|
||||
|
||||
# Texpad
|
||||
.texpadtmp
|
||||
|
||||
# LyX
|
||||
*.lyx~
|
||||
|
||||
# Kile
|
||||
*.backup
|
||||
|
||||
# gummi
|
||||
.*.swp
|
||||
|
||||
# KBibTeX
|
||||
*~[0-9]*
|
||||
|
||||
# TeXnicCenter
|
||||
*.tps
|
||||
|
||||
# auto folder when using emacs and auctex
|
||||
./auto/*
|
||||
*.el
|
||||
|
||||
# expex forward references with \gathertags
|
||||
*-tags.tex
|
||||
|
||||
# standalone packages
|
||||
*.sta
|
||||
|
||||
# Makeindex log files
|
||||
*.lpz
|
||||
|
||||
# xwatermark package
|
||||
*.xwm
|
||||
|
||||
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
|
||||
# option is specified. Footnotes are the stored in a file with suffix Notes.bib.
|
||||
# Uncomment the next line to have this generated file ignored.
|
||||
#*Notes.bib
|
75
README.md
75
README.md
|
@ -1,14 +1,85 @@
|
|||
# Project 02: Multi-source code search
|
||||
|
||||
**Claudio Maggioni**
|
||||
|
||||
### About the Project
|
||||
|
||||
This project has the goal of developing a search engine able to query a large Python code repository using multiple sources of information.
|
||||
This project has the goal of developing a search engine able to query a large Python code repository using multiple
|
||||
sources of information.
|
||||
It is part of the Knowledge Analysis & Management - 2022 course from the Università della Svizzera italiana.
|
||||
|
||||
In this repository, you can find the following files:
|
||||
|
||||
- tensor flow: a code repository to be used during this project
|
||||
- ground-truth-unique: a file containing the references triples necessary to evaluate the search engine (step 3)
|
||||
- ground-truth-unique: a file containing the references triples necessary to evaluate the search engine (step 3)
|
||||
|
||||
For more information, see the Project-02 slides (available on iCourse)
|
||||
|
||||
Note: Feel free to modify this file according to the project's necessities.
|
||||
|
||||
## Environment setup
|
||||
|
||||
To install the required dependencies make sure `python3` points to a Python 3.10 or 3.11 installation and then run:
|
||||
|
||||
```shell
|
||||
python3 -m venv env
|
||||
source env/bin/activate
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Part 1: data extraction
|
||||
|
||||
To extract the data in file `data.csv` run the command:
|
||||
|
||||
```shell
|
||||
python3 extract-data.py
|
||||
```
|
||||
|
||||
The script prints the requested counts, which are namely:
|
||||
|
||||
```
|
||||
Methods: 5817
|
||||
Functions: 4565
|
||||
Classes: 1882
|
||||
Python Files: 2817
|
||||
```
|
||||
|
||||
## Part 2: Training
|
||||
|
||||
In order to train and predict the output of a given query run the command:
|
||||
|
||||
```shell
|
||||
python3 search-data.py [method] "[query]"
|
||||
```
|
||||
|
||||
where `[method]` is one of `{tfidf,freq,lsi,doc2vec}` or `all` to run all classifiers and `[query]` is the natural
|
||||
language query to search. Outputs are printed on stdout, and in case of `doc2vec` the trained model file is saved in
|
||||
`./doc2vec_model.dat` and fetched in this path for subsequent executions.
|
||||
|
||||
## Part 3: Evaluation
|
||||
|
||||
To evaluate a model run the command:
|
||||
|
||||
```shell
|
||||
python3 search-data.py [method] ./ground-truth-unique.txt
|
||||
```
|
||||
|
||||
where `[method]` is one of `{tfidf,freq,lsi,doc2vec}` or `all` to evaluate all classifiers. The script outputs the
|
||||
performance of the classifiers in terms of average precision and recall, which are namely:
|
||||
|
||||
| Engine | Average Precision | Average Recall |
|
||||
|:---------|:--------------------|:-----------------|
|
||||
| tfidf | 90.00% | 90.00% |
|
||||
| freq | 93.33% | 100.00% |
|
||||
| lsi | 90.00% | 90.00% |
|
||||
| doc2vec | 73.33% | 80.00% |
|
||||
|
||||
## Report
|
||||
|
||||
To compile the report run:
|
||||
|
||||
```shell
|
||||
cd report
|
||||
pdflatex -interaction=nonstopmode -output-directory=. main.tex
|
||||
pdflatex -interaction=nonstopmode -output-directory=. main.tex
|
||||
```
|
Binary file not shown.
|
@ -15,7 +15,7 @@ def find_py_files(dir):
|
|||
|
||||
|
||||
def keep_name(name):
|
||||
return not name.startswith("_") and not "main" in str(name).lower() and \
|
||||
return not name.startswith("_") and "main" not in str(name).lower() and \
|
||||
"test" not in str(name).lower()
|
||||
|
||||
|
||||
|
@ -28,7 +28,7 @@ class FeatureVisitor(ast.NodeVisitor):
|
|||
def visit_FunctionDef(self, node):
|
||||
if keep_name(node.name):
|
||||
self.rows.append({
|
||||
"name": node.name,
|
||||
"name": node.name,
|
||||
"file": self.filename,
|
||||
"line": node.lineno,
|
||||
"type": "function",
|
||||
|
@ -56,14 +56,14 @@ class FeatureVisitor(ast.NodeVisitor):
|
|||
})
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
df = pd.DataFrame(columns=["name", "file", "line", "type", "comment"])
|
||||
|
||||
for file in find_py_files(IN_DIR):
|
||||
files = list(find_py_files(IN_DIR))
|
||||
|
||||
for file in files:
|
||||
with open(file, "r") as f:
|
||||
py_source = f.read()
|
||||
|
||||
|
||||
py_ast = ast.parse(py_source)
|
||||
|
||||
visitor = FeatureVisitor(file)
|
||||
|
@ -71,6 +71,16 @@ def main():
|
|||
df_visitor = pd.DataFrame.from_records(visitor.rows)
|
||||
df = pd.concat([df, df_visitor])
|
||||
|
||||
counts = df["type"].apply(lambda ft: {
|
||||
"function": "Functions",
|
||||
"class": "Classes",
|
||||
"method": "Methods"
|
||||
}[ft]).value_counts().to_dict()
|
||||
counts["Python Files"] = len(files)
|
||||
|
||||
for file_type, name in counts.items():
|
||||
print(f"{file_type}: {name}")
|
||||
|
||||
df.reset_index(drop=True).to_csv(OUT_FILE)
|
||||
|
||||
|
||||
|
|
Binary file not shown.
After Width: | Height: | Size: 76 KiB |
|
@ -0,0 +1,2 @@
|
|||
Precision: 73.33%
|
||||
Recall: 80.00%
|
|
@ -0,0 +1,2 @@
|
|||
Precision: 93.33%
|
||||
Recall: 100.00%
|
Binary file not shown.
After Width: | Height: | Size: 72 KiB |
|
@ -0,0 +1,2 @@
|
|||
Precision: 90.00%
|
||||
Recall: 90.00%
|
|
@ -0,0 +1,2 @@
|
|||
Precision: 90.00%
|
||||
Recall: 90.00%
|
|
@ -0,0 +1,149 @@
|
|||
import argparse
|
||||
import os.path
|
||||
from typing import Iterable, Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import tqdm
|
||||
from matplotlib import pyplot as plt
|
||||
from sklearn.manifold import TSNE
|
||||
|
||||
search_data = __import__('search-data')
|
||||
|
||||
TENSORFLOW_PATH_PREFIX: str = "./"
|
||||
OUT_DIR: str = os.path.join(os.path.dirname(__file__), "out")
|
||||
|
||||
|
||||
def read_ground_truth(file_path: str, df: pd.DataFrame) -> Iterable[tuple[str, int]]:
|
||||
records: list[list[str]] = []
|
||||
|
||||
with open(file_path) as f:
|
||||
record_tmp = []
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line == '':
|
||||
assert len(record_tmp) == 3
|
||||
records.append(record_tmp)
|
||||
record_tmp = []
|
||||
else:
|
||||
record_tmp.append(line)
|
||||
|
||||
if len(record_tmp) == 3:
|
||||
records.append(record_tmp)
|
||||
|
||||
for query, name, file_name in records:
|
||||
assert file_name.startswith(TENSORFLOW_PATH_PREFIX)
|
||||
file_name = file_name[len(TENSORFLOW_PATH_PREFIX):]
|
||||
|
||||
row = df[(df.name == name) & (df.file == file_name)]
|
||||
assert len(row) == 1
|
||||
|
||||
yield query, row.index[0]
|
||||
|
||||
|
||||
def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
|
||||
for i, le in enumerate(li):
|
||||
if le[0] == e:
|
||||
return i
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def plot_df(results, query: str) -> Optional[pd.DataFrame]:
|
||||
if results.vectors is not None and results.query_vector is not None:
|
||||
tsne_vectors = np.array(results.vectors + [results.query_vector])
|
||||
tsne = TSNE(n_components=2, perplexity=2, n_iter=3000)
|
||||
tsne_results = tsne.fit_transform(tsne_vectors)
|
||||
df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'Query', 'Vector kind'])
|
||||
df['tsne-2d-one'] = tsne_results[:, 0]
|
||||
df['tsne-2d-two'] = tsne_results[:, 1]
|
||||
df['Query'] = [query] * (len(results.vectors) + 1)
|
||||
df['Vector kind'] = (['Result'] * len(results.vectors)) + ['Input query']
|
||||
return df
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def evaluate(method_name: str, file_path: str) -> tuple[float, float]:
|
||||
df = search_data.load_data()
|
||||
test_set = list(read_ground_truth(file_path, df))
|
||||
|
||||
precision_sum = 0
|
||||
recall_sum = 0
|
||||
|
||||
dfs = []
|
||||
|
||||
for query, expected in tqdm.tqdm(test_set):
|
||||
search_results = search_data.search(query, method_name, df)
|
||||
|
||||
df_q = plot_df(search_results, query)
|
||||
if df_q is not None:
|
||||
dfs.append(df_q)
|
||||
|
||||
idx = better_index(search_results.indexes_scores, expected)
|
||||
|
||||
if idx is None:
|
||||
precision = 0
|
||||
recall = 0
|
||||
else:
|
||||
precision = 1 / (idx + 1)
|
||||
recall = 1
|
||||
|
||||
precision_sum += precision
|
||||
recall_sum += recall
|
||||
|
||||
if not os.path.isdir(OUT_DIR):
|
||||
os.makedirs(OUT_DIR)
|
||||
|
||||
precision = precision_sum * 100 / len(test_set)
|
||||
recall = recall_sum * 100 / len(test_set)
|
||||
|
||||
output = "Precision: {0:.2f}%\nRecall: {1:.2f}%\n".format(precision, recall)
|
||||
|
||||
print(output)
|
||||
with open(os.path.join(OUT_DIR, "{0}_prec_recall.txt".format(method_name)), "w") as f:
|
||||
f.write(output)
|
||||
|
||||
if len(dfs) > 0:
|
||||
df = pd.concat(dfs)
|
||||
plt.figure(figsize=(12, 10))
|
||||
sns.scatterplot(
|
||||
x="tsne-2d-one", y="tsne-2d-two",
|
||||
hue="Query",
|
||||
style="Vector kind",
|
||||
palette=sns.color_palette("husl", n_colors=10),
|
||||
data=df,
|
||||
legend="full",
|
||||
alpha=1.0
|
||||
)
|
||||
plt.savefig(os.path.join(OUT_DIR, "{0}_plot.png".format(method_name)))
|
||||
|
||||
return precision, recall
|
||||
|
||||
|
||||
def main():
|
||||
methods = ["tfidf", "freq", "lsi", "doc2vec"]
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("method", help="the method to compare similarities with", type=str, choices=methods + ["all"])
|
||||
parser.add_argument("ground_truth_file", help="file where ground truth comes from", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.method == "all":
|
||||
df = pd.DataFrame(columns=["Engine", "Average Precision", "Average Recall"])
|
||||
|
||||
for i, method in enumerate(methods):
|
||||
print(f"Applying method {method}:")
|
||||
precision, recall = evaluate(method, args.ground_truth_file)
|
||||
df.loc[i, "Engine"] = method
|
||||
df.loc[i, "Average Precision"] = f"{precision:.2f}%"
|
||||
df.loc[i, "Average Recall"] = f"{recall:.2f}%"
|
||||
|
||||
print(df.to_markdown(index=False))
|
||||
else:
|
||||
evaluate(args.method, args.ground_truth_file)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
Binary file not shown.
|
@ -0,0 +1,297 @@
|
|||
%!TEX TS-program = pdflatexmk
|
||||
\documentclass{article}
|
||||
|
||||
\usepackage{algorithm}
|
||||
\usepackage{textcomp}
|
||||
\usepackage{xcolor}
|
||||
\usepackage{soul}
|
||||
\usepackage{booktabs}
|
||||
\usepackage[utf8]{inputenc}
|
||||
\usepackage[T1]{fontenc}
|
||||
\usepackage{microtype}
|
||||
\usepackage{rotating}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{paralist}
|
||||
\usepackage{tabularx}
|
||||
\usepackage{multicol}
|
||||
\usepackage{multirow}
|
||||
\usepackage{pbox}
|
||||
\usepackage{enumitem}
|
||||
\usepackage{colortbl}
|
||||
\usepackage{pifont}
|
||||
\usepackage{xspace}
|
||||
\usepackage{url}
|
||||
\usepackage{tikz}
|
||||
\usepackage{fontawesome}
|
||||
\usepackage{lscape}
|
||||
\usepackage{listings}
|
||||
\usepackage{color}
|
||||
\usepackage{anyfontsize}
|
||||
\usepackage{comment}
|
||||
\usepackage{multibib}
|
||||
\usepackage{float}
|
||||
\usepackage{caption}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{amssymb}
|
||||
\usepackage{amsmath}
|
||||
\usepackage{changepage}
|
||||
\usepackage{hyperref}
|
||||
|
||||
\title{Knowledge Management and Analysis \\ Project 01: Code Search}
|
||||
\author{Claudio Maggioni}
|
||||
\date{}
|
||||
|
||||
\begin{document}
|
||||
|
||||
\maketitle
|
||||
|
||||
\begin{adjustwidth}{-4cm}{-4cm}
|
||||
\centering
|
||||
\begin{tabular}{cc}
|
||||
\toprule
|
||||
Repository URL & \url{https://github.com/kamclassroom2022/project-01-multi-search-maggicl} \\
|
||||
Commit ID & \texttt{b8e0a2c3c41249e45b233b55607e0b04ebe1bad0} \\ \bottomrule
|
||||
\end{tabular}
|
||||
\end{adjustwidth}
|
||||
\vspace{1cm}
|
||||
|
||||
|
||||
\subsection*{Section 1 - Data Extraction}
|
||||
|
||||
The data extraction (implemented in the script \texttt{extract-data.py}) process scans through the files in the
|
||||
TensorFlow project to extract Python docstrings and symbol names for functions, classes and methods. A summary of the
|
||||
number of features extracted can be found in table~\ref{tab:count1}. The collected figures show that the number of
|
||||
classes is more than half the number of files, while the number of functions is about twice the number of files.
|
||||
Additionally, the data shows that a class has slightly more than 2 methods in it on average.
|
||||
|
||||
\begin{table}[H]
|
||||
\centering
|
||||
\begin{tabular}{cc}
|
||||
\toprule
|
||||
Type & Number \\
|
||||
\midrule
|
||||
Python files & 2817 \\
|
||||
Classes & 1882 \\
|
||||
Functions & 4565 \\
|
||||
Methods & 5817 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{Count of created classes and properties.}
|
||||
\label{tab:count1}
|
||||
\end{table}
|
||||
|
||||
\subsection*{Section 2: Training of search engines}
|
||||
|
||||
The training and model execution of the search engines is implemented in the Python script \texttt{search-data.py}.
|
||||
The training model loads the data extracted by \texttt{extract-data.py} and uses as classification features the
|
||||
identifier name and only the first line of the comment docstring. All other comment lines are filtered out as this
|
||||
significantly increases performance when evaluating the models.
|
||||
|
||||
The script is able to search a given natural language query among the extracted TensorFlow corpus using four techniques.
|
||||
These are namely: Word Frequency Similarity, Term-Frequency Inverse Document-Frequency (TF-IDF) Similarity, Latent
|
||||
Semantic Indexing (LSI), and Doc2Vec.
|
||||
|
||||
An example output of results generated from the query ``Gather gpu device info'' for the word frequency, TF-IDF, LSI
|
||||
and Doc2Vec models are shown in
|
||||
figures~\ref{fig:search-freq},~\ref{fig:search-tfidf},~\ref{fig:search-lsi}~and~\ref{fig:search-doc2vec} respectively.
|
||||
All four models are able to correctly report the ground truth required by the file \texttt{ground-truth-unique.txt} as
|
||||
the first result with $>90\%$ similarity, with the except of the Doc2Vec model which reports $71.63\%$ similarity.
|
||||
|
||||
\begin{figure}[b]
|
||||
\small
|
||||
\begin{verbatim}
|
||||
Similarity: 90.45%
|
||||
Python function: gather_gpu_devices
|
||||
Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
|
||||
File: tensorflow/tensorflow/tools/test/gpu_info_lib.py
|
||||
Line: 167
|
||||
|
||||
Similarity: 57.74%
|
||||
Python function: gather_memory_info
|
||||
Description: Gather memory info.
|
||||
File: tensorflow/tensorflow/tools/test/system_info_lib.py
|
||||
Line: 70
|
||||
|
||||
Similarity: 57.74%
|
||||
Python function: gather_platform_info
|
||||
Description: Gather platform info.
|
||||
File: tensorflow/tensorflow/tools/test/system_info_lib.py
|
||||
Line: 146
|
||||
|
||||
Similarity: 55.47%
|
||||
Python function: compute_capability_from_device_desc
|
||||
Description: Returns the GpuInfo given a DeviceAttributes proto. Args: devi...
|
||||
File: tensorflow/tensorflow/python/framework/gpu_util.py
|
||||
Line: 35
|
||||
|
||||
Similarity: 55.47%
|
||||
Python function: gather_available_device_info
|
||||
Description: Gather list of devices available to TensorFlow. Returns: A lis...
|
||||
File: tensorflow/tensorflow/tools/test/system_info_lib.py
|
||||
Line: 126
|
||||
\end{verbatim}
|
||||
\caption{Search result output for the query ``Gather gpu device info'' using the word frequency similarity model.}
|
||||
\label{fig:search-freq}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[b]
|
||||
\small
|
||||
\begin{verbatim}
|
||||
Similarity: 90.95%
|
||||
Python function: gather_gpu_devices
|
||||
Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
|
||||
File: tensorflow/tensorflow/tools/test/gpu_info_lib.py
|
||||
Line: 167
|
||||
|
||||
Similarity: 59.12%
|
||||
Python function: gather_memory_info
|
||||
Description: Gather memory info.
|
||||
File: tensorflow/tensorflow/tools/test/system_info_lib.py
|
||||
Line: 70
|
||||
|
||||
Similarity: 56.40%
|
||||
Python function: gather_available_device_info
|
||||
Description: Gather list of devices available to TensorFlow. Returns: A lis...
|
||||
File: tensorflow/tensorflow/tools/test/system_info_lib.py
|
||||
Line: 126
|
||||
|
||||
Similarity: 55.25%
|
||||
Python function: gather_platform_info
|
||||
Description: Gather platform info.
|
||||
File: tensorflow/tensorflow/tools/test/system_info_lib.py
|
||||
Line: 146
|
||||
|
||||
Similarity: 53.97%
|
||||
Python function: info
|
||||
File: tensorflow/tensorflow/python/platform/tf_logging.py
|
||||
Line: 167
|
||||
\end{verbatim}
|
||||
\caption{Search result output for the query ``Gather gpu device info'' using the TF-IDF model.}
|
||||
\label{fig:search-tfidf}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[b]
|
||||
\small
|
||||
\begin{verbatim}
|
||||
Similarity: 98.38%
|
||||
Python function: gather_gpu_devices
|
||||
Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
|
||||
File: tensorflow/tensorflow/tools/test/gpu_info_lib.py
|
||||
Line: 167
|
||||
|
||||
Similarity: 97.66%
|
||||
Python function: device
|
||||
Description: Uses gpu when requested and available.
|
||||
File: tensorflow/tensorflow/python/framework/test_util.py
|
||||
Line: 1581
|
||||
|
||||
Similarity: 97.66%
|
||||
Python function: device
|
||||
Description: Uses gpu when requested and available.
|
||||
File: tensorflow/tensorflow/python/keras/testing_utils.py
|
||||
Line: 925
|
||||
|
||||
Similarity: 96.79%
|
||||
Python class: ParallelDevice
|
||||
Description: A device which executes operations in parallel.
|
||||
File: tensorflow/tensorflow/python/distribute/parallel_device/parallel_device.py
|
||||
Line: 42
|
||||
|
||||
Similarity: 96.67%
|
||||
Python method: get_var_on_device
|
||||
File: tensorflow/tensorflow/python/distribute/packed_distributed_variable.py
|
||||
Line: 90
|
||||
\end{verbatim}
|
||||
\caption{Search result output for the query ``Gather gpu device info'' using the LSI model.}
|
||||
\label{fig:search-lsi}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[b]
|
||||
\small
|
||||
\begin{verbatim}
|
||||
Similarity: 71.63%
|
||||
Python function: gather_gpu_devices
|
||||
Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
|
||||
File: tensorflow/tensorflow/tools/test/gpu_info_lib.py
|
||||
Line: 167
|
||||
|
||||
Similarity: 66.71%
|
||||
Python function: device
|
||||
Description: Uses gpu when requested and available.
|
||||
File: tensorflow/tensorflow/python/keras/testing_utils.py
|
||||
Line: 925
|
||||
|
||||
Similarity: 65.23%
|
||||
Python function: gpu_device_name
|
||||
Description: Returns the name of a GPU device if available or the empty str...
|
||||
File: tensorflow/tensorflow/python/framework/test_util.py
|
||||
Line: 129
|
||||
|
||||
Similarity: 64.33%
|
||||
Python function: gather_available_device_info
|
||||
Description: Gather list of devices available to TensorFlow. Returns: A lis...
|
||||
File: tensorflow/tensorflow/tools/test/system_info_lib.py
|
||||
Line: 126
|
||||
|
||||
Similarity: 64.29%
|
||||
Python method: hosts
|
||||
Description: A list of device names for CPU hosts. Returns: A list of devic...
|
||||
File: tensorflow/tensorflow/python/tpu/tpu_embedding.py
|
||||
Line: 1011
|
||||
\end{verbatim}
|
||||
\caption{Search result output for the query ``Gather gpu device info'' using the Doc2Vec model.}
|
||||
\label{fig:search-doc2vec}
|
||||
\end{figure}
|
||||
|
||||
\subsection*{Section 3: Evaluation of search engines}
|
||||
|
||||
The evaluation over the given ground truth to compute precision, recall, and the T-SNE plots is performed by the script
|
||||
\texttt{prec-recall.py}. The calculated average precision and recall values are reported in table~\ref{tab:tab2}.
|
||||
|
||||
Precision and recall are quite high for all models.
|
||||
The word frequency model has the highest precision and recall ($93.33\%$ and $100.00\%$ respectively), while the Doc2Vec
|
||||
model has the lowest precision ($73.33\%$) and lowest recall ($80.00\%$).
|
||||
|
||||
\begin{table}[H]
|
||||
\centering
|
||||
\begin{tabular}{ccc}
|
||||
\toprule
|
||||
Engine & Avg Precision & Recall \\
|
||||
\midrule
|
||||
Frequencies & 93.33\% & 100.00\% \\
|
||||
TD-IDF & 90.00\% & 90.00\% \\
|
||||
LSI & 90.00\% & 90.00\% \\
|
||||
Doc2Vec & 73.33\% & 80.00\% \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{Evaluation of search engines.}
|
||||
\label{tab:tab2}
|
||||
\end{table}
|
||||
|
||||
\subsection*{TBD Section 4: Visualisation of query results}
|
||||
|
||||
The two-dimensional T-SNE plots (computed with perplexity $= 2$) for the LSI and Doc2Vec models are respectively in
|
||||
figures~\ref{fig:tsne-lsi}~and~\ref{fig:tsne-doc2vec}.
|
||||
|
||||
The T-SNE plot for the LSI model shows evidently the presence of outliers in the search result. The Doc2Vec plot shows
|
||||
fewer outliers and more distinct clusters for the results of each query and the query vector itself. However, even
|
||||
considering the good performance for both models, it is hard to distinguish from the plots given distinct ``regions''
|
||||
where results and their respective query are located.
|
||||
|
||||
\begin{figure}
|
||||
\begin{center}
|
||||
\includegraphics[width=\textwidth]{../out/lsi_plot}
|
||||
\caption{T-SNE plot for the LSI model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
|
||||
\label{fig:tsne-lsi}
|
||||
\end{center}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}
|
||||
\begin{center}
|
||||
\includegraphics[width=\textwidth]{../out/doc2vec_plot}
|
||||
\caption{T-SNE plot for the Doc2Vec model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
|
||||
\label{fig:tsne-doc2vec}
|
||||
\end{center}
|
||||
\end{figure}
|
||||
|
||||
\end{document}
|
|
@ -1,3 +1,9 @@
|
|||
nltk==3.8.1
|
||||
pandas==2.1.1
|
||||
coloredlogs==15.0.1
|
||||
gensim==4.3.2
|
||||
nltk==3.8.1
|
||||
numpy==1.26.1
|
||||
pandas==2.1.2
|
||||
tqdm==4.66.1
|
||||
scikit-learn==1.3.2
|
||||
seaborn==0.13.0
|
||||
tabulate==0.9.0
|
203
search-data.py
203
search-data.py
|
@ -1,109 +1,216 @@
|
|||
import re
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import pandas as pd
|
||||
import re
|
||||
import typing
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import coloredlogs
|
||||
import nltk
|
||||
import numpy as np
|
||||
from nltk.corpus import stopwords
|
||||
from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity
|
||||
from gensim.models import TfidfModel, LsiModel, LdaModel
|
||||
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
|
||||
import pandas as pd
|
||||
from gensim.corpora import Dictionary
|
||||
from collections import defaultdict
|
||||
from gensim.models import TfidfModel, LsiModel
|
||||
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
|
||||
from gensim.similarities import SparseMatrixSimilarity
|
||||
from nltk.corpus import stopwords
|
||||
|
||||
# Download the stopword corpus once at import time; quiet=True suppresses
# the progress output on every run.
nltk.download('stopwords', quiet=True)

# Resolve paths relative to this script so it works from any working directory.
SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")  # input corpus (CSV)
DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat")  # cached trained Doc2Vec model

# using nltk stop words and example words for now
# NOTE(review): the extra words look like corpus-specific noise terms
# (common Python identifiers) — confirm they should stay excluded.
STOP_WORDS = set(stopwords.words('english')) \
    .union(['test', 'tests', 'main', 'this', 'self', 'int', 'get', 'set', 'new', 'return', 'list'])
|
||||
|
||||
|
||||
def find_all(regex: str, word: str, lower=True) -> list[str]:
    """Return every match of *regex* in *word*, lower-cased unless lower=False."""
    results = []
    for match in re.finditer(regex, word):
        text = match.group(0)
        results.append(text.lower() if lower else text)
    return results
|
||||
|
||||
|
||||
# https://stackoverflow.com/a/29920015
def camel_case_split(word: str) -> list[str]:
    """Split a camelCase/PascalCase identifier into its component words (lower-cased)."""
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    return find_all(boundary_pattern, word)
|
||||
|
||||
|
||||
def identifier_split(identifier: str) -> list[str]:
    """Split a snake_case and/or camelCase identifier into lower-cased words."""
    words = []
    for chunk in identifier.split("_"):
        words.extend(camel_case_split(chunk))
    return words
|
||||
|
||||
|
||||
def comment_split(comment):
|
||||
return find_all('[A-Za-z0-9]+', comment)
|
||||
def comment_split(comment: Optional[float | str], is_comment=True) -> list[str]:
|
||||
if (type(comment) == float and np.isnan(comment)) or comment is None:
|
||||
return []
|
||||
|
||||
# Consider only first line of each comment. Increases performance significantly
|
||||
if is_comment:
|
||||
comment = str(comment).split("\n", maxsplit=2)[0]
|
||||
|
||||
# Camel case split within "words" found takes care of referenced type names in the docstring comment
|
||||
return [s for word in find_all('[A-Za-z]+', comment, lower=False) for s in camel_case_split(word)]
|
||||
|
||||
|
||||
def remove_stopwords(input_bow_list: list[str]) -> list[str]:
    """Drop stop words and tokens shorter than three characters from a bag of words."""
    kept = []
    for token in input_bow_list:
        if token in STOP_WORDS or len(token) <= 2:
            continue
        kept.append(token)
    return kept
|
||||
|
||||
|
||||
def get_bow(data, split_f):
|
||||
def get_bow(data: Optional[float | str], split_f) -> list[str]:
|
||||
if data is None or (type(data) == float and np.isnan(data)):
|
||||
return []
|
||||
return remove_stopwords(split_f(data))
|
||||
|
||||
|
||||
def pick_most_similar(corpus, query, dictionary) -> list[tuple[int, float]]:
    """Rank *corpus* documents against *query* and return the top 5 (index, score) pairs.

    :param corpus: gensim bag-of-words (or weighted) corpus
    :param query: query vector in the same representation as the corpus
    :param dictionary: gensim Dictionary; its size fixes the feature count
    :return: the five best (document index, similarity score) pairs, best first
    """
    top_n = 5
    similarity_index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
    scores = similarity_index[query]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]
|
||||
|
||||
def print_results(indexes_scores: list[tuple[int, float]], df):
    """Pretty-print ranked search results to stdout.

    :param indexes_scores: (dataframe row index, similarity score) pairs, best first
    :param df: dataframe with "type", "name", "file", "line" and "comment" columns
    """
    print("\n===== RESULTS: =====")

    # Compile the whitespace-collapsing pattern once, not per result row.
    whitespace_re = re.compile(r'[\s\n]+', re.MULTILINE)

    for idx, score in indexes_scores:
        row = df.loc[idx]

        comment = row["comment"]
        # Missing comments come through as NaN (a float), not a string.
        if not isinstance(comment, str):
            desc = ""
        else:
            comment = re.sub(whitespace_re, ' ', comment)
            desc = "Description: {c}\n".format(c=comment)
            # Truncate long descriptions to keep the listing compact.
            desc = (desc[:75] + '...\n') if len(desc) > 75 else desc

        print("\nSimilarity: {s:2.02f}%".format(s=score * 100))
        print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}"
              .format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"]))
|
||||
|
||||
|
||||
def search(query, method):
|
||||
df = pd.read_csv(IN_DATASET)
|
||||
def train_doc2vec(corpus_list):
    """Train a Doc2Vec model on the given corpus and cache it to disk.

    Each document is tagged with its positional index so that similarity
    results can be mapped back to dataframe rows.

    :param corpus_list: list of token lists, one per document
    :return: the trained gensim Doc2Vec model (also saved to DOC2VEC_MODEL)
    """
    dvdocs = [TaggedDocument(text, [i]) for i, text in enumerate(corpus_list)]
    # sample=0 disables downsampling of frequent words; vector_size/epochs
    # presumably tuned for this corpus — TODO confirm.
    model = Doc2Vec(vector_size=300, epochs=50, sample=0)
    model.build_vocab(dvdocs)
    model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(DOC2VEC_MODEL)
    return model
|
||||
|
||||
|
||||
def load_data(print_frequent=False) -> pd.DataFrame:
    """Load the corpus CSV and derive bag-of-words columns for names and comments.

    :param print_frequent: when True, also print the 100 most frequent tokens
        across both bag-of-words columns (useful for curating STOP_WORDS)
    :return: dataframe with added "name_bow" and "comment_bow" columns
    """
    df = pd.read_csv(IN_DATASET, index_col=0)
    df["name_bow"] = df["name"].apply(lambda name: get_bow(name, identifier_split))
    df["comment_bow"] = df["comment"].apply(lambda comment: get_bow(comment, comment_split))

    if print_frequent:
        counts = defaultdict(int)
        for column in ("name_bow", "comment_bow"):
            for bow in df[column].tolist():
                for token in bow:
                    counts[token] += 1

        ranking = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        for token, count in ranking[:100]:
            print(f"{count}: {token}")

    return df
|
||||
|
||||
|
||||
# Type aliases: gensim returns sparse (index, value) pairs, while plotting
# code wants dense numpy vectors. np.ndarray is the actual array type
# (np.array is a factory function, not a type).
SparseVector = list[tuple[int, float]]
DenseVector = np.ndarray


def to_dense(vector: SparseVector) -> DenseVector:
    """Convert a sparse (index, value) vector into a dense 1-D numpy array.

    The buffer is sized from the largest index present, so entries the model
    omitted (e.g. zero-weight dimensions) no longer raise IndexError, which
    the previous len(vector)-sized buffer did whenever an index exceeded the
    number of sparse entries.

    :param vector: sparse vector as (index, value) pairs
    :return: dense numpy array with values placed at their indices
    """
    if not vector:
        return np.array([])
    size = max(idx for idx, _ in vector) + 1
    dense = [0.0] * size
    for idx, value in vector:
        dense[idx] = value
    return np.array(dense)
|
||||
|
||||
|
||||
@dataclass
class SearchResults:
    """Container for one search run: ranked matches plus optional vectors for plotting.

    Attributes:
        indexes_scores: (dataframe row index, similarity score) pairs, best first.
        vectors: dense vectors of the matched documents, or None for methods
            that do not expose document vectors (tfidf/freq).
        query_vector: dense vector of the query itself, or None likewise.
    """
    # @dataclass generates __init__ from these fields; the previous
    # hand-written __init__ duplicated it (and renamed its first parameter),
    # defeating the purpose of the decorator. All in-file callers pass
    # arguments positionally, so the generated __init__ is compatible.
    indexes_scores: list[tuple[int, float]]
    vectors: Optional[list[DenseVector]]
    query_vector: Optional[DenseVector]
|
||||
|
||||
|
||||
def search(query: str, method: str, df: pd.DataFrame) -> SearchResults:
    """Search the corpus in *df* for *query* using the given retrieval method.

    :param query: free-text query string
    :param method: one of "tfidf", "freq", "lsi" or "doc2vec"
    :param df: dataframe from load_data() with "name_bow"/"comment_bow" columns
    :return: SearchResults with the top matches; lsi and doc2vec also include
        dense vectors (used for T-SNE plotting)
    :raises ValueError: if *method* is not a known method name
    """
    # One bag-of-words document per dataframe row: identifier words + comment words.
    corpus_list = []
    for _, row in df.iterrows():
        document_words = row["name_bow"] + row["comment_bow"]
        corpus_list.append(document_words)

    # NOTE(review): the query is tokenized but, unlike the corpus, not passed
    # through remove_stopwords — presumably intentional; confirm.
    query_w = comment_split(query, is_comment=False)
    dictionary = None
    corpus_bow = None
    query_bow = None

    # doc2vec consumes raw token lists; all other methods need a gensim
    # dictionary and bag-of-words vectors.
    if method != "doc2vec":
        dictionary = Dictionary(corpus_list)
        corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
        query_bow = dictionary.doc2bow(query_w)

    if method == "tfidf":
        tfidf = TfidfModel(corpus_bow)
        return SearchResults(pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary), None, None)
    elif method == "freq":
        # Raw term-frequency vectors, no weighting.
        return SearchResults(pick_most_similar(corpus_bow, query_bow, dictionary), None, None)
    elif method == "lsi":
        lsi = LsiModel(corpus_bow, num_topics=50)
        corpus = typing.cast(list[SparseVector], lsi[corpus_bow])
        results = pick_most_similar(corpus, lsi[query_bow], dictionary)
        # Dense vectors of the matched documents, for T-SNE plotting.
        result_vectors: list[DenseVector] = [to_dense(corpus[idx]) for idx, _ in results]
        return SearchResults(results, result_vectors, to_dense(lsi[query_bow]))
    elif method == "doc2vec":
        # Reuse the on-disk model when present; training is expensive.
        if os.path.exists(DOC2VEC_MODEL):
            model = Doc2Vec.load(DOC2VEC_MODEL)
        else:
            model = train_doc2vec(corpus_list)

        dv_query = model.infer_vector(query_w)
        results = model.dv.most_similar([dv_query], topn=5)

        # infer_vector is re-run per match to obtain plottable document vectors.
        result_vectors = [model.infer_vector(corpus_list[idx]) for idx, _ in results]
        return SearchResults(results, result_vectors, dv_query)
    else:
        raise ValueError("method unknown")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, load the corpus, run the search, print results."""
    known_methods = ["tfidf", "freq", "lsi", "doc2vec"]

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("method", help="the method to compare similarities with", type=str,
                            choices=known_methods + ["all"])
    arg_parser.add_argument("query", help="the query to search the corpus with", type=str)
    arg_parser.add_argument("-v", "--verbose", help="enable verbose logging", action='store_true')
    args = arg_parser.parse_args()

    if args.verbose:
        coloredlogs.install()
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    df = load_data()

    # Single-method run: just print the results and stop.
    if args.method != "all":
        outcome = search(args.query, args.method, df)
        print_results(outcome.indexes_scores, df)
        return

    # "all": run every known method in sequence, separated by blank lines.
    for current_method in known_methods:
        print(f"Applying method {current_method}:")
        outcome = search(args.query, current_method, df)
        print_results(outcome.indexes_scores, df)
        print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
Loading…
Reference in New Issue