diff --git a/meta_creator/metadata_extractor.py b/meta_creator/metadata_extractor.py index 0f02de7..81874c4 100644 --- a/meta_creator/metadata_extractor.py +++ b/meta_creator/metadata_extractor.py @@ -12,10 +12,8 @@ from .gitlab_metadata import get_gitlab_metadata from .read_tokens import read_token_from_file from .hermes_process import run_hermes_commands -import json -import os - - +from .spdx_utils import validate_license + #################### getting metadata from github/gitlab project #################### @csrf_exempt @@ -55,7 +53,15 @@ def data_extraction(request): if not extracted_metadata: extracted_metadata = get_gitlab_metadata(gl_url, default_access_token_gitlab) - result['metadata'] = init_curated_metadata(extracted_metadata) + output = validate_license(extracted_metadata) + if output.get('success') is False: + return { + 'success': False, + 'errors': 'No valid license found in metadata.' + } + else: + result['metadata'] = init_curated_metadata(extracted_metadata) + else: # TODO we need to pass the token to hermes_process @@ -66,12 +72,20 @@ def data_extraction(request): # hermes_metadata = get_github_metadata(gl_url, default_access_token_GH) if isinstance(hermes_metadata, dict): - result['metadata'] = init_curated_metadata(hermes_metadata.get('metadata')) - result['warnings'].extend(hermes_metadata.get('warnings', [])) - result['errors'].extend(hermes_metadata.get('errors', [])) - result['success'] = hermes_metadata.get('success', False) + extracted_metadata = hermes_metadata.get('metadata') + if extracted_metadata: + output = validate_license(extracted_metadata) + if output.get('success') is False: + return { + 'success': False, + 'errors': 'No valid license found in metadata.' 
"""Validate repository license metadata against the official SPDX license list."""
import re
from functools import lru_cache

SPDX_URL = 'https://raw.githubusercontent.com/spdx/license-list-data/master/json/licenses.json'

# Captures the identifier segment of an SPDX license URL such as
# "https://spdx.org/licenses/Apache-2.0".
_SPDX_URL_PATTERN = re.compile(r'spdx\.org/licenses/([A-Za-z0-9\.-]+)')

# License pages are commonly linked with a file extension
# (".../licenses/MIT.html"); the extension is not part of the identifier.
_SPDX_URL_SUFFIXES = ('.html', '.json')


@lru_cache(maxsize=1)
def get_spdx_licenses():
    """Download the set of current (non-deprecated) SPDX license identifiers.

    The result is cached for the lifetime of the process, so the network is
    only contacted on the first successful call. A failed call is not cached
    and will be retried on the next invocation.

    Returns:
        frozenset[str]: SPDX license identifiers in their canonical casing.
        A frozenset is returned so callers cannot mutate the cached value.

    Raises:
        requests.RequestException: If the list cannot be downloaded or the
            server answers with an HTTP error status.
    """
    # Imported lazily: only this function needs HTTP access, which keeps the
    # pure parsing helpers importable without the third-party dependency.
    import requests

    response = requests.get(SPDX_URL, timeout=10)
    response.raise_for_status()
    data = response.json()

    return frozenset(
        entry["licenseId"]
        for entry in data.get("licenses", [])
        if "licenseId" in entry
        and not entry.get("isDeprecatedLicenseId", False)
    )


def _extract_spdx_id_from_url(url):
    """Return the SPDX identifier embedded in an spdx.org license URL, or None."""
    match = _SPDX_URL_PATTERN.search(url)
    if not match:
        return None

    spdx_id = match.group(1)
    lowered = spdx_id.lower()
    for suffix in _SPDX_URL_SUFFIXES:
        if lowered.endswith(suffix):
            spdx_id = spdx_id[:-len(suffix)]
            break

    # A URL at the end of a sentence may drag a trailing period along.
    spdx_id = spdx_id.rstrip('.')
    return spdx_id or None


def extract_license_from_metadata(metadata):
    """Pull a license identifier out of an extracted-metadata dictionary.

    The ``license`` entry may be a list of SPDX URLs, a dict carrying
    ``spdx_id``/``key``, or a plain string (SPDX URL or bare identifier).
    If it yields nothing, the ``spdx_license`` entry is used as a fallback.

    Args:
        metadata: Metadata dictionary; any other type yields ``None``.

    Returns:
        The license identifier candidate, or ``None`` when none is found.
        The value is not checked against the SPDX list here.
    """
    if not isinstance(metadata, dict):
        return None

    license_data = metadata.get('license')

    if isinstance(license_data, list):
        for item in license_data:
            if isinstance(item, str):
                spdx_id = _extract_spdx_id_from_url(item)
                if spdx_id:
                    return spdx_id
    elif isinstance(license_data, dict):
        candidate = license_data.get('spdx_id') or license_data.get('key')
        if candidate:
            return candidate.strip() if isinstance(candidate, str) else candidate
    elif isinstance(license_data, str):
        candidate = _extract_spdx_id_from_url(license_data) or license_data.strip()
        if candidate:
            return candidate

    # Fallback field; guard the type because the value may be None.
    spdx_license = metadata.get('spdx_license')
    if isinstance(spdx_license, str):
        return spdx_license.strip() or None

    return None


def validate_license(metadata):
    """Check whether the metadata names a current SPDX license.

    Args:
        metadata: Metadata dictionary as accepted by
            ``extract_license_from_metadata``.

    Returns:
        dict: ``{'success': True}`` if a non-deprecated SPDX identifier was
        found, otherwise ``{'success': False}``.

    Raises:
        requests.RequestException: If a license candidate was found but the
            SPDX license list could not be downloaded.
    """
    license_id = extract_license_from_metadata(metadata)

    # Reject missing and non-string candidates before touching the network.
    if not license_id or not isinstance(license_id, str):
        return {'success': False}

    spdx_licenses = get_spdx_licenses()
    if license_id in spdx_licenses:
        return {'success': True}

    # SPDX identifiers are matched case-insensitively; some sources only
    # provide a lower-cased key such as "mit".
    wanted = license_id.casefold()
    matched = any(wanted == spdx_id.casefold() for spdx_id in spdx_licenses)
    return {'success': matched}