forked from mindsdb/mindsdb
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: check_requirements.py
More file actions
332 lines (270 loc) · 13.3 KB
/
check_requirements.py
File metadata and controls
332 lines (270 loc) · 13.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
import glob
import re
import sys
import subprocess
import os
import json
# Splits a requirement line at the first version-specifier character,
# whitespace, comment marker, or extras bracket; the text before the split
# point is the bare package name. Raw string avoids the invalid-escape
# warnings the previous non-raw literal needed a `noqa: W605` for.
pattern = r'=|~|>|<| |\n|#|\['


def get_requirements_from_file(path):
    """Takes a requirements file path and extracts only the package names from it.

    Blank lines and comment-only lines (which yield an empty name) are skipped.
    """
    with open(path, 'r') as reqs_file:
        names = []
        for line in reqs_file:
            # Split once per line (the original called re.split twice per line).
            name = re.split(pattern, line)[0]
            if name:
                names.append(name)
    return names
# Paths to the shared (non-handler) requirements files.
MAIN_REQS_PATH = "requirements/requirements.txt"
DEV_REQS_PATH = "requirements/requirements-dev.txt"
TEST_REQS_PATH = "requirements/requirements-test.txt"

# Every handler-specific requirements*.txt in the repo, i.e. all of them
# except the shared ones under requirements/.
HANDLER_REQS_PATHS = list(
    set(glob.glob("**/requirements*.txt", recursive=True))
    - set(glob.glob("requirements/requirements*.txt"))
)

# Regex paths deptry should skip when checking the core codebase.
# Fixed: the previous "pryproject.toml" typo meant pyproject.toml was never excluded.
MAIN_EXCLUDE_PATHS = ["mindsdb/integrations/handlers/.*_handler", "pyproject.toml"]

# Torch.multiprocessing is imported in a 'try'. Falls back to multiprocessing so we dont NEED it.
# Psycopg2 is needed in core codebase for sqlalchemy.
# lark is required for auto retrieval (RAG utilities). It is used by langchain
# and not explicitly imported in mindsdb.
# transformers is required for langchain_core and not explicitly imported by mindsdb.
MAIN_RULE_IGNORES = {
    "DEP003": ["torch"],
    "DEP001": ["torch", "pgvector", "pyarrow", "openai"],
    "DEP002": ["psycopg2-binary", "lark", "transformers", "langchain-experimental", "lxml", "openpyxl"]
}

# The following packages need exceptions.
# Either because 1) they are optional deps of some other packages. E.g.:
#   - langchain CAN use openai
#   - pypdf and openpyxl are optional deps of langchain, that are used for the file handler
# Or 2) because they are imported in an unusual way. E.g.:
#   - pysqlite3 in the chromadb handler
#   - dspy-ai in langchain handler
OPTIONAL_HANDLER_DEPS = ["torch", "tiktoken", "wikipedia",
                         "sentence-transformers", "faiss-cpu", "litellm", "chromadb", "dspy-ai", "sqlalchemy-solr"]

# Here we ignore any packages in the main requirements.txt for "listed but not used"
# errors, because they will be used for the core code but not necessarily in a
# given handler.
MAIN_REQUIREMENTS_DEPS = get_requirements_from_file(MAIN_REQS_PATH) + get_requirements_from_file(
    TEST_REQS_PATH)

# pyarrow is needed by the BYOM handler. (Name typo "HANLDER" fixed; the constant
# is only referenced in HANDLER_RULE_IGNORES below.)
BYOM_HANDLER_DEPS = ["pyarrow"]
# The `thrift-sasl` package is required to establish a connection to Hive via
# `pyhive`, but it is not explicitly imported in the code.
HIVE_HANDLER_DEPS = ["thrift-sasl"]
# The `gcsfs` package is required to interact with GCS as a file system.
GCS_HANDLER_DEPS = ["gcsfs"]

# List of rules we can ignore for specific packages when checking handlers.
HANDLER_RULE_IGNORES = {
    "DEP002": OPTIONAL_HANDLER_DEPS + MAIN_REQUIREMENTS_DEPS + BYOM_HANDLER_DEPS + HIVE_HANDLER_DEPS + GCS_HANDLER_DEPS,
    "DEP001": ["tests", "pyarrow", "IfxPyDbi", "ingres_sa_dialect"]  # 'tests' is the mindsdb tests folder in the repo root, 'pyarrow' used in snowflake handler
}
# Maps PyPI distribution names to the top-level module name(s) they actually
# provide, for packages where the two differ. Passed to deptry via
# --package-module-name-map so that imports can be matched against
# requirements entries.
PACKAGE_NAME_MAP = {
    "azure-storage-blob": ["azure"],
    "scylla-driver": ["cassandra"],
    "mysql-connector-python": ["mysql"],
    "snowflake-connector-python": ["snowflake"],
    "snowflake-sqlalchemy": ["snowflake"],
    "auto-sklearn": ["autosklearn"],
    "google-cloud-aiplatform": ["google"],
    "google-cloud-bigquery": ["google"],
    "google-cloud-spanner": ["google"],
    "sqlalchemy-spanner": ["google"],
    "google-auth-httplib2": ["google"],
    "google-generativeai": ["google"],
    "google-analytics-admin": ["google"],
    "google-auth": ["google"],
    "google-cloud-storage": ["google"],
    "protobuf": ["google"],
    "google-api-python-client": ["googleapiclient"],
    "ibm-cos-sdk": ["ibm_boto3", "ibm_botocore"],
    "binance-connector": ["binance"],
    "pysqlite3": ["pysqlite3"],
    "atlassian-python-api": ["atlassian"],
    "databricks-sql-connector": ["databricks"],
    "elasticsearch-dbapi": ["es"],
    "pygithub": ["github"],
    "python-gitlab": ["gitlab"],
    "impyla": ["impala"],
    "IfxPy": ["IfxPyDbi"],
    "salesforce-merlion": ["merlion"],
    "newsapi-python": ["newsapi"],
    "pinecone-client": ["pinecone"],
    "plaid-python": ["plaid"],
    "faiss-cpu": ["faiss"],
    "writerai": ["writer"],
    "rocketchat_API": ["rocketchat_API"],
    "ShopifyAPI": ["shopify"],
    "solace-pubsubplus": ["solace"],
    "taospy": ["taosrest"],
    "weaviate-client": ["weaviate"],
    "pymupdf": ["fitz"],
    "ibm-db": ["ibm_db_dbi"],
    "python-dateutil": ["dateutil"],
    "sqlalchemy-redshift": ["redshift_sqlalchemy"],
    "sqlalchemy-vertica-python": ["sqla_vertica_python"],
    "psycopg2-binary": ["psycopg2"],
    "psycopg-binary": ["psycopg"],
    "pymongo": ["pymongo", "bson"],
    "python-multipart": ["multipart"],
    "pydateinfer": ["dateinfer"],
    "scikit-learn": ["sklearn"],
    "influxdb3-python": ["influxdb_client_3"],
    "hubspot-api-client": ["hubspot"],
    "eventbrite-python": ["eventbrite"],
    "clickhouse-sqlalchemy": ["clickhouse_sqlalchemy"],
    "pillow": ["PIL"],
    "auto-ts": ["auto_ts"],
    "llama-index-readers-web": ["llama_index"],
    "llama-index-embeddings-openai": ["llama_index"],
    "unifyai": ["unify"],
    "botframework-connector": ["botframework"],
    "botbuilder-schema": ["botbuilder"],
    "opentelemetry-api": ["opentelemetry"],
    "opentelemetry-sdk": ["opentelemetry"],
    "opentelemetry-exporter-otlp": ["opentelemetry"],
    "opentelemetry-instrumentation-requests": ["opentelemetry"],
    "opentelemetry-instrumentation-flask": ["opentelemetry"],
    "opentelemetry-distro": ["opentelemetry"],
    "sqlalchemy-ingres": ["ingres_sa_dialect"],
    "pyaml": ["yaml"],
}
# We use this to exit with a non-zero status code if any check fails
# so that when this is running in CI the job will fail.
# Mutated via `global success` by the check functions below.
success = True
def print_errors(file, errors):
    """Print the error messages for one file and flag overall failure.

    Does nothing when `errors` is empty; otherwise sets the module-level
    `success` flag to False and prints the file name followed by each
    message, ending with a blank line.
    """
    global success
    if not errors:
        return
    success = False
    print(f"- {file}")
    for message in errors:
        print(" " + message)
    print()
def get_ignores_str(ignores_dict):
    """Serialise a {rule: [package, ...]} mapping into deptry's CLI format.

    Example: {"DEP001": ["a", "b"]} -> "DEP001=a|b"; entries are
    comma-separated, packages within a rule are pipe-separated.
    """
    parts = []
    for rule, packages in ignores_dict.items():
        parts.append(f"{rule}={'|'.join(packages)}")
    return ",".join(parts)
def run_deptry(reqs, rule_ignores, path, extra_args=""):
    """Run a dependency check with deptry. Return a list of error messages.

    Args:
        reqs: comma-separated requirements file path(s) to check against.
        rule_ignores: per-rule ignore string (see get_ignores_str).
        path: directory to scan.
        extra_args: extra CLI flags appended verbatim to the command.

    Returns:
        A list of human-readable error strings, one per deptry finding
        (or a single entry describing a deptry invocation failure).
    """
    errors = []
    try:
        # shell=True with an f-string command: acceptable here because every
        # interpolated value comes from this repo, not from untrusted input.
        result = subprocess.run(
            f"deptry -o deptry.json --no-ansi --known-first-party mindsdb --requirements-files \"{reqs}\" --per-rule-ignores \"{rule_ignores}\" --package-module-name-map \"{get_ignores_str(PACKAGE_NAME_MAP)}\" {extra_args} {path}",
            shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE
        )
        if result.returncode != 0 and not os.path.exists("deptry.json"):
            # deptry itself crashed before writing its report. Previously the
            # code fell through and open("deptry.json") raised
            # FileNotFoundError, crashing the whole script and losing this
            # message. Return early instead.
            errors.append(f"Error running deptry: {result.stderr.decode('utf-8')}")
            return errors
        with open("deptry.json", "r") as f:
            deptry_results = json.load(f)
        for r in deptry_results:
            errors.append(
                f"{r['location']['line']}:{r['location']['column']}: {r['error']['code']} {r['error']['message']}")
    finally:
        # Always clean up the report file, even on an early return or error.
        if os.path.exists("deptry.json"):
            os.remove("deptry.json")
    return errors
def check_for_requirements_duplicates():
    """Checks that handler requirements.txt and the main requirements.txt don't contain any of the same packages.

    Prints a message for every duplicate found and sets the module-level
    `success` flag to False.
    """
    global success
    # Use a set for O(1) membership tests instead of scanning a list per package.
    main_reqs = set(get_requirements_from_file(MAIN_REQS_PATH))
    for file in HANDLER_REQS_PATHS:
        for req in get_requirements_from_file(file):
            if req in main_reqs:
                print(f"{req} is in {file} and also in main requirements file.")
                success = False
def check_relative_reqs():
    """
    Check that relationships between handlers are defined correctly.

    If a parent handler imports another handler in code, we should define that dependency
    in the parent handler's requirements.txt like:
        -r mindsdb/integrations/handlers/child_handler/requirements.txt
    This is important to ensure that "pip install mindsdb[parent_handler]" works correctly.

    This function checks, for each handler imported from another handler, that there is a
    corresponding entry in a requirements.txt. It also reports:
      - relative imports of handlers (which should be absolute),
      - "-r" entries pointing at handlers that are never imported,
      - "-r" entries pointing at requirements files that don't exist.
    All findings are printed and flip the module-level `success` flag via print_errors.
    """
    global success
    # regex for finding relative imports of handlers like "from ..file_handler import FileHandler"
    # we're going to treat these as errors (and suggest using absolute imports instead)
    relative_import_pattern = re.compile("(?:\s|^)(?:from|import) \.\.\w+_handler")  # noqa: W605

    def get_relative_requirements(files):
        """Find entries in a requirements.txt that are including another requirements.txt.

        Returns a dict mapping the included handler's directory name to the
        (lower-cased, stripped) "-r ..." line that references it.
        """
        entries = {}
        for file in files:
            with open(file, 'r') as fh:
                for line in fh.readlines():
                    # Normalise so the "-r" prefix match and later path
                    # comparisons are case-insensitive.
                    line = line.lower().strip()
                    if line.startswith("-r mindsdb/integrations/handlers/"):
                        entries[line.split("mindsdb/integrations/handlers/")[1].split("/")[0]] = line
        return entries

    for handler_dir in glob.glob("mindsdb/integrations/handlers/*/"):
        handler_name = handler_dir.split("/")[-2].split("_handler")[0]
        # regex for finding imports of other handlers like "from mindsdb.integrations.handlers.file_handler import FileHandler"
        # excludes the current handler importing parts of itself (negative lookahead)
        import_pattern = re.compile(
            f"(?:\s|^)(?:from|import) mindsdb\.integrations\.handlers\.(?!{handler_name}_handler)\w+_handler")  # noqa: W605
        # requirements entries for this handler that point to another handler's requirements file
        required_handlers = get_relative_requirements(
            [file for file in HANDLER_REQS_PATHS if file.startswith(handler_dir)])
        all_imported_handlers = []
        # for every python file in this handler's code
        for file in glob.glob(f"{handler_dir}/**/*.py", recursive=True):
            errors = []
            # find all the imports of handlers
            with open(file, "r") as f:
                file_content = f.read()
                relative_imported_handlers = [match.strip() for match in
                                              re.findall(relative_import_pattern, file_content)]
                handler_import_lines = [match.strip() for match in re.findall(import_pattern, file_content)]
                # Map each matched import line to the imported handler's directory name.
                imported_handlers = {line: line.split("_handler")[0].split(".")[-1] + "_handler" for line in
                                     handler_import_lines}
                all_imported_handlers += imported_handlers.values()
                # Report on relative imports (like "from ..file_handler import FileHandler")
                for line in relative_imported_handlers:
                    errors.append(f"{line} <- Relative import of handler. Use absolute import instead")
                # Report on imports of other handlers that are missing a corresponding requirements.txt entry
                for line, imported_handler_name in imported_handlers.items():
                    # Check if the imported handler has a requirements.txt file.
                    # Handlers with no requirements.txt need no "-r" entry.
                    imported_handler_req_file = f"mindsdb/integrations/handlers/{imported_handler_name}/requirements.txt"
                    if os.path.exists(imported_handler_req_file):
                        if imported_handler_name not in required_handlers.keys():
                            errors.append(
                                f"{line} <- {imported_handler_name} not in handler requirements.txt. Add it like: \"-r {imported_handler_req_file}\"")
            # Print all the errors for this .py file
            print_errors(file, errors)
        # Report on requirements.txt entries that point to a handler that isn't used
        requirements_errors = [required_handler_name + " in requirements.txt but not used in code" for required_handler_name in required_handlers.keys() if
                               required_handler_name not in all_imported_handlers]
        print_errors(handler_dir, requirements_errors)
        # Report on requirements.txt entries that point to a handler requirements file that doesn't exist
        errors = []
        for _, required_handler_line in required_handlers.items():
            if not os.path.exists(required_handler_line.split('-r ')[1]):
                errors.append(f"{required_handler_line} <- this requirements file doesn't exist.")
        print_errors(handler_dir, errors)
def check_requirements_imports():
    """
    Use deptry to find issues with dependencies.

    Runs deptry on the core codebase (excluding handlers) against the main
    requirements.txt file. Then runs it on each handler codebase against its
    own requirements.txt plus the main and test requirements files.
    All findings are printed and flip the module-level `success` flag via
    print_errors.
    """
    # Run against the main codebase.
    # (Was ','.join([MAIN_REQS_PATH]) -- a no-op join over a single element.)
    errors = run_deptry(
        MAIN_REQS_PATH,
        get_ignores_str(MAIN_RULE_IGNORES),
        ".",
        f"--extend-exclude \"{'|'.join(MAIN_EXCLUDE_PATHS)}\"",
    )
    print_errors(MAIN_REQS_PATH, errors)

    # Run on each handler, scanning only that handler's directory.
    for file in HANDLER_REQS_PATHS:
        errors = run_deptry(
            f"{file},{MAIN_REQS_PATH},{TEST_REQS_PATH}",
            get_ignores_str(HANDLER_RULE_IGNORES),
            os.path.dirname(file),
        )
        print_errors(file, errors)
# Run all checks sequentially. Each check prints its own findings and flips
# the module-level `success` flag to False when it finds a problem.
print("--- Checking requirements files for duplicates ---")
check_for_requirements_duplicates()
print()
print("--- Checking that requirements match imports ---")
check_requirements_imports()
print()
print("--- Checking handlers that require other handlers ---")
check_relative_reqs()
# Non-zero exit makes the CI job fail when any check reported an error.
sys.exit(0 if success else 1)