# kse-01/extract-data.py
#
# 89 lines
# 2.5 KiB
# Python

import ast
import pandas as pd
import os
# Absolute path of the directory that contains this script; the two paths
# below are resolved against it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Directory that is searched for ".py" files.
IN_DIR = os.path.join(SCRIPT_DIR, "tensorflow")

# CSV file that receives the extracted rows.
OUT_FILE = os.path.join(SCRIPT_DIR, "data.csv")
def find_py_files(dir):
    """Lazily yield the path of every ``.py`` file found beneath ``dir``.

    The tree is traversed with ``os.walk``. Each yielded path is the walked
    directory joined with the file name, so it is absolute or relative
    exactly as ``dir`` is. Only names ending in ``.py`` are reported.

    Args:
        dir: Root directory of the search.

    Yields:
        One path per matching file, in the order ``os.walk`` reports them.
    """
    for folder, _subdirs, filenames in os.walk(dir):
        yield from (
            os.path.join(folder, filename)
            for filename in filenames
            if filename.endswith(".py")
        )
def keep_name(name):
    """Return whether a definition called ``name`` should be recorded.

    A name is rejected when it starts with an underscore, or when its
    lower-cased form contains ``"main"`` or ``"test"`` anywhere.

    Args:
        name: Identifier of a function, class or method.

    Returns:
        ``True`` if the name passes all three filters, ``False`` otherwise.
    """
    if name.startswith("_"):
        return False

    lowered = str(name).lower()
    return "main" not in lowered and "test" not in lowered
class FeatureVisitor(ast.NodeVisitor):
    """AST visitor that records the functions, classes and methods of a file.

    Every definition whose name passes ``keep_name`` is appended to
    ``self.rows`` as a dict with the keys ``name``, ``file``, ``line``,
    ``type`` and ``comment`` (the docstring, or ``None`` when there is none).
    """

    def __init__(self, filename):
        """Create a visitor for the source file at ``filename``.

        Args:
            filename: Path of the file being analysed; it is stored
                relative to ``SCRIPT_DIR``.
        """
        self.filename = os.path.relpath(filename, SCRIPT_DIR)
        self.rows = []

    def _record(self, node, kind):
        """Append a row describing ``node`` if its name passes ``keep_name``."""
        if not keep_name(node.name):
            return
        self.rows.append(dict(
            name=node.name,
            file=self.filename,
            line=node.lineno,
            type=kind,
            comment=ast.get_docstring(node),
        ))

    def visit_FunctionDef(self, node):
        """Record a function definition reached by the visitor.

        The function body is not visited further, so definitions nested
        inside it are not recorded from here.
        """
        self._record(node, "function")

    def visit_ClassDef(self, node):
        """Record a class and every ``FunctionDef`` found anywhere inside it.

        ``ast.walk`` descends through the whole class subtree, so functions
        nested inside methods or inside nested classes are also reported
        with type ``"method"``.
        """
        self._record(node, "class")

        # NOTE(review): the original indentation was lost; this assumes the
        # walk runs even when the class name itself is filtered out - confirm.
        for inner in ast.walk(node):
            if isinstance(inner, ast.FunctionDef):
                self._record(inner, "method")
def main():
    """Extract definitions from every Python file under ``IN_DIR``.

    Each file is parsed with ``ast`` and visited by a ``FeatureVisitor``.
    The collected rows are written to ``OUT_FILE`` as CSV with the columns
    ``name``, ``file``, ``line``, ``type`` and ``comment``, preceded by a
    0-based index column. A per-type count and the number of files scanned
    are printed to standard output.

    Raises:
        SyntaxError: If a file cannot be parsed as Python source.
        OSError: If a source file cannot be read or the CSV cannot be written.
    """
    columns = ["name", "file", "line", "type", "comment"]
    files = list(find_py_files(IN_DIR))

    rows = []
    for file in files:
        # Read raw bytes so that ast.parse honours the file's own encoding
        # declaration or BOM instead of the locale's default text encoding.
        with open(file, "rb") as f:
            py_source = f.read()

        visitor = FeatureVisitor(file)
        visitor.visit(ast.parse(py_source, filename=file))
        rows.extend(visitor.rows)

    # Build the frame once; concatenating inside the loop re-copies all
    # previously collected rows for every file.
    df = pd.DataFrame(rows, columns=columns)

    labels = {
        "function": "Functions",
        "class": "Classes",
        "method": "Methods",
    }
    counts = df["type"].map(labels).value_counts().to_dict()
    counts["Python Files"] = len(files)

    for label, count in counts.items():
        print(f"{label}: {count}")

    df.to_csv(OUT_FILE)
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()