diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 8dd399a..0000000 --- a/.flake8 +++ /dev/null @@ -1,3 +0,0 @@ -[flake8] -max-line-length = 88 -extend-ignore = E203 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 19f4425..a3b5938 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -43,9 +43,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi + pip install -e ".[dev]" - name: Test with pytest run: | pytest tests diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 03dfe47..fc65887 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: - id: trailing-whitespace - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.12.3 + rev: v0.12.5 hooks: # Run the linter. - id: ruff-check diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 0ac2298..e0a08f6 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,15 +7,14 @@ formats: build: os: ubuntu-22.04 tools: - python: "3.11" - + python: "3.12" sphinx: configuration: docs/conf.py python: install: - - requirements: requirements.txt - - requirements: requirements-dev.txt - method: pip path: . 
+ extra_requirements: + - dev diff --git a/LICENSE b/LICENSE index f4b0791..8566649 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2022 WEFE Team +Copyright (c) 2025 WEFE Team Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index d6ab8d6..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,2 +0,0 @@ -include requirements.txt -include wefe/datasets/data/* diff --git a/README.rst b/README.rst index fc782bc..83d2e83 100644 --- a/README.rst +++ b/README.rst @@ -55,91 +55,155 @@ The official documentation can be found at this `link =3.8.3) +- numpy (<=1.26.4) +- pandas (>=2.0.0) +- plotly (>=6.0.0) +- requests (>=2.22.0) +- scikit-learn (>=1.5.0) +- scipy (<1.13) +- semantic_version (>=2.8.0) +- tqdm (>=4.0.0) Contributing ------------ -You can download the code executing :: +To contribute to WEFE development: + +1. **Clone the repository**:: git clone https://github.com/dccuchile/wefe + cd wefe + +2. **Install in development mode with all dependencies**:: + pip install -e ".[dev]" + +3. **Run tests to ensure everything works**:: + + pytest tests -To contribute, visit the `Contributing `_ section in the documentation. +4. **Make your changes and run tests again** + +5. **Follow our coding standards**: + - Use ``ruff`` for code formatting: ``ruff format .`` + - Check code quality: ``ruff check .`` + - Run type checking: ``mypy wefe`` + +For detailed contributing guidelines, visit the `Contributing `_ section in the documentation. 
Development Requirements ------------------------ -To install the necessary dependencies for the development, testing and compilation -of WEFE documentation, run :: +To install WEFE with all development dependencies for testing, documentation building, and code quality tools:: + + pip install "wefe[dev]" - pip install -r requirements-dev.txt +This installs additional packages including: + +- pytest and pytest-cov for testing +- sphinx and related packages for documentation +- ruff for code formatting and linting +- mypy for type checking +- ipython for interactive development Testing ------- -All unit tests are in the wefe/tests folder. It uses ``pytest`` as a framework to -run them. +All unit tests are in the ``tests/`` folder. WEFE uses ``pytest`` as the testing framework. -To run the test, execute:: +To run all tests:: pytest tests -To check the coverage, run:: +To run tests with coverage reporting:: + + pytest tests --cov=wefe --cov-report=html - pytest tests --cov-report xml:cov.xml --cov wefe +To run a specific test file:: -And then:: + pytest tests/test_datasets.py - coverage report -m +Coverage reports will be generated in the ``htmlcov/`` directory. Build the documentation ----------------------- -The documentation is created using sphinx. -It can be found in the docs folder at the root of the project. -To compile the documentation, run: +The documentation is built using Sphinx and can be found in the ``docs/`` folder. + +To build the documentation:: -.. 
code-block:: bash + cd docs + make html + +Or using the development environment:: + pip install "wefe[dev]" cd docs make html -Then, you can visit the documentation at ``docs/_build/html/index.html`` +The built documentation will be available at ``docs/_build/html/index.html`` Changelog ========= +Version 1.0.0 +------------------- + +**Major Release - Breaking Changes** + +- **Python 3.10+ Required**: Dropped support for Python 3.6-3.9 +- **Modern Packaging**: Migrated from ``setup.py`` to ``pyproject.toml`` +- **Updated Dependencies**: All packages updated for modern Python ecosystem + +**New Features**: + +- Robust dataset fetching with retry mechanism and exponential backoff +- HTTP 429 (rate limiting) and timeout error handling +- Optional dependencies: ``pip install "wefe[dev]"`` and ``"wefe[pytorch]"`` +- Dynamic version loading from ``wefe.__version__`` + +**Core Improvements**: + +- **WordEmbeddingModel**: Enhanced type safety, better gensim compatibility, improved error handling +- **BaseMetric**: Refactored input validation, standardized ``run_query`` methods across all metrics +- **Testing**: Converted to pytest patterns with monkeypatch, comprehensive test coverage +- **Code Quality**: Migration from flake8 to Ruff, enhanced documentation with detailed docstrings + +**Development Workflow**: + +- GitHub Actions upgraded with Python 3.10-3.13 matrix testing +- Pre-commit hooks enhanced with JSON/TOML validation and security checks +- Modernized Sphinx documentation configuration +- Updated benchmark documentation and metrics comparison tables + Version 0.4.1 ------------------- diff --git a/docs/benchmark/benchmark.rst b/docs/benchmark/benchmark.rst index f7443fc..9dfffdd 100644 --- a/docs/benchmark/benchmark.rst +++ b/docs/benchmark/benchmark.rst @@ -1096,29 +1096,42 @@ methods they implement to date. 
Fairness Metrics ~~~~~~~~~~~~~~~~ -=================== ========================= ==== === =========== =================== -Metric Implementable in WEFE WEFE FEE Responsibly EmbeddingBiasScores -=================== ========================= ==== === =========== =================== -WEAT ✔ ✔ ✔ ✔ ✔ -WEAT ES ✔ ✔ ✖ ✖ ✖ -RNSB ✔ ✔ ✖ ✖ ✖ -RIPA ✔ ✔ ✖ ✖ ✔ -ECT ✔ ✔ ✖ ✖ ✖ -RND ✔ ✔ ✖ ✖ ✖ -MAC ✔ ✔ ✖ ✖ ✔ -Direct Bias ✔ ✖ ✔ ✔ ✔ -SAME ✔ ✖ ✖ ✖ ✔ -Generalized WEAT ✔ ✖ ✖ ✖ ✔ -IndirectBias ✖ ✖ ✖ ✔ ✖ -GIPE ✖ ✖ ✔ ✖ ✖ -PMN ✖ ✖ ✔ ✖ ✖ -Proximity Bias ✖ ✖ ✔ ✖ ✖ -=================== ========================= ==== === =========== =================== - -The following metrics are not compatible with WEFE because they do not align with the abstractions defined by the framework: - -- IndirectBias, which accepts only two words and a gender direction as input, relying on a previously calculated bias direction. -- GIPE, PMN, and Proximity Bias, which evaluate word embedding models before and after debiasing using auxiliary mitigation methods. 
++---------------------+----------------------------+------+------+-------------+-----------------------+ +| Metric | Implementable in WEFE | WEFE | FEE | Responsibly | EmbeddingBiasScores | ++=====================+============================+======+======+=============+=======================+ +| WEAT | ✔ | ✔ | ✔ | ✔ | ✔ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ +| WEAT ES | ✔ | ✔ | ✖ | ✖ | ✖ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ +| RNSB | ✔ | ✔ | ✖ | ✖ | ✖ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ +| RIPA | ✔ | ✔ | ✖ | ✖ | ✔ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ +| ECT | ✔ | ✔ | ✖ | ✖ | ✖ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ +| RND | ✔ | ✔ | ✖ | ✖ | ✖ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ +| MAC | ✔ | ✔ | ✖ | ✖ | ✔ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ +| Direct Bias | ✔ | ✖ | ✔ | ✔ | ✔ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ +| SAME | ✔ | ✖ | ✖ | ✖ | ✔ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ +| Generalized WEAT | ✔ | ✖ | ✖ | ✖ | ✔ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ +| IndirectBias | ✖ | ✖ | ✖ | ✔ | ✖ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ +| GIPE | ✖ | ✖ | ✔ | ✖ | ✖ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ +| PMN | ✖ | ✖ | ✔ | ✖ | ✖ | 
++---------------------+----------------------------+------+------+-------------+-----------------------+ +| Proximity Bias | ✖ | ✖ | ✔ | ✖ | ✖ | ++---------------------+----------------------------+------+------+-------------+-----------------------+ + +Metrics marked as "✔" in the "Implementable in WEFE" column can be implemented directly within +the WEFE framework using word sets as input. +Metrics marked as "✖" require additional representations such as gender directions +or apply before/after transformations, and are therefore currently out of WEFE's scope. Mitigation algorithms @@ -1143,7 +1156,7 @@ libraries analyzed in this benchmark study. ==================================================== ========================================= ========================================================== ========================================== ==================================== Item WEFE FEE Responsibly EmbeddingBiasScores ==================================================== ========================================= ========================================================== ========================================== ==================================== - Implemented Metrics 7 7 3 6 + Implemented Metrics 7 5 3 6 Implemented Mitigation Algorithms 5 3 1 0 Extensible Easy Easy Difficult, not very modular. Easy Well-defined interface for metrics ✔ ✖ ✖ ✔ diff --git a/docs/conf.py b/docs/conf.py index 231c252..7c45899 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,346 +1,226 @@ +# Configuration file for the Sphinx documentation builder. # -# project-template documentation build configuration file, created by -# sphinx-quickstart on Mon Jan 18 14:44:12 2016. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. 
+# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html import os +from pathlib import Path import sys -# import sphinx -# import sphinx_rtd_theme +# Add the project root to the Python path +project_root = Path(__file__).parent.parent +sys.path.insert(0, str(project_root)) + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "WEFE" +copyright = "2025, The WEFE Team" +author = "The WEFE Team" -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath("..")) -# -- General configuration ------------------------------------------------ +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +from wefe import __version__ # noqa: E402 -# If your documentation needs a minimal Sphinx version, state it here. -# needs_sphinx = '1.0' +version = __version__ +release = __version__ +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +# Minimum Sphinx version required +needs_sphinx = "4.0" # Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
extensions = [ "sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.doctest", "sphinx.ext.intersphinx", "sphinx.ext.viewcode", - "numpydoc", - "sphinx_gallery.gen_gallery", + "sphinx.ext.napoleon", "sphinx.ext.todo", "sphinx.ext.mathjax", "sphinx.ext.ifconfig", - "sphinx.ext.napoleon", + "numpydoc", + "sphinx_gallery.gen_gallery", "sphinx_copybutton", ] +# Configure autodoc +autodoc_default_options = { + "members": True, + "inherited-members": True, + "undoc-members": True, + "show-inheritance": True, +} -# this is needed for some reason... -# see https://github.com/numpy/numpydoc/issues/69 +# Generate autosummary even if no references +autosummary_generate = True + +# NumPy-style docstring configuration numpydoc_show_class_members = False +numpydoc_class_members_toctree = False -# pngmath / imgmath compatibility layer for different sphinx versions +# Napoleon settings for Google/NumPy style docstrings +napoleon_google_docstring = True +napoleon_numpy_docstring = True +napoleon_include_init_with_doc = False +napoleon_include_private_with_doc = False +# Math rendering configuration if os.environ.get("NO_MATHJAX"): extensions.append("sphinx.ext.imgmath") imgmath_image_format = "svg" mathjax_path = "" else: - extensions.append("sphinx.ext.mathjax") mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js" - -autodoc_default_flags = ["members", "inherited-members"] - -# Add any paths that contain templates here, relative to this directory. + mathjax_config = { + "tex": { + "inlineMath": [["$", "$"], ["\\(", "\\)"]], + "displayMath": [["$$", "$$"], ["\\[", "\\]"]], + } + } + +# Source file configuration templates_path = ["_templates"] - -# generate autosummary even if no references -autosummary_generate = True - -# The suffix of source filenames. -# source_suffix = ".rst" source_suffix = { ".rst": "restructuredtext", } - -# The encoding of source files. 
-# source_encoding = 'utf-8-sig' - -# Generate the plots for the gallery -plot_gallery = False - -# The master toctree document. master_doc = "index" +exclude_patterns = ["_build", "_templates", "Thumbs.db", ".DS_Store"] -# General information about the project. -project = "WEFE" -copyright = "The WEFE Team" - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -from wefe import __version__ # noqa: E402 - -# __version__ = '0.0.1' -version = __version__ -# The full version, including alpha/beta/rc tags. -release = __version__ - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -# today = '' -# Else, today_fmt is used as the format for a strftime call. -# today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ["_build", "_templates"] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -# default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -add_function_parentheses = False +# Internationalization +language = "en" -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -# add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -# show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. +# Syntax highlighting pygments_style = "sphinx" -# Custom style -# html_style = "css/project-template.css" - -# A list of ignored prefixes for module index sorting. 
-# modindex_common_prefix = [] +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -# If true, keep warnings as "system message" paragraphs in the built documents. -# keep_warnings = False - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. +# HTML theme configuration html_theme = "sphinx_rtd_theme" -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. html_theme_options = { "logo_only": True, "display_version": True, + "prev_next_buttons_location": "bottom", + "style_external_links": False, + "collapse_navigation": True, + "sticky_navigation": True, + "navigation_depth": 4, + "includehidden": True, + "titles_only": False, } -# Add any paths that contain custom themes here, relative to this directory. -# html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -# html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. +# HTML output configuration +html_title = f"WEFE v{version} Documentation" html_short_title = "WEFE Documentation" - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. html_logo = "logos/WEFE_2_BLANCO.svg" - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. html_favicon = "logos/WEFE.ico" -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. 
They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". +# Static files and styling html_static_path = ["_static"] +html_css_files = [ + "css/theme_overrides.css", +] -# html_context = { -# "css_files": [ -# "_static/css/theme_overrides.css", # overrides for wide tables in RTD theme -# ], -# } - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -# html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -# html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -# html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -# html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -# html_additional_pages = {} - -# If false, no module index is generated. -# html_domain_indices = True - -# If false, no index is generated. -# html_use_index = True - -# If true, the index is split into individual pages for each letter. -# html_split_index = False - -# If true, links to the reST sources are added to the pages. -# html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -# html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -# html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -# html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). 
-# html_file_suffix = None +# JavaScript files +html_js_files = [ + "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js" +] -# Output file base name for HTML help builder. -htmlhelp_basename = "wefe" +# Additional HTML options +html_show_sourcelink = True +html_show_sphinx = True +html_show_copyright = True +html_copy_source = True +html_use_opensearch = "" -# -- Options for LaTeX output --------------------------------------------- +# -- Options for LaTeX output ------------------------------------------------ latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. + "papersize": "letterpaper", + "pointsize": "10pt", "preamble": r""" - \usepackage{amsmath}\usepackage{amsfonts}\usepackage{bm} - \usepackage{morefloats}\usepackage{enumitem} \setlistdepth{10} + \usepackage{amsmath} + \usepackage{amsfonts} + \usepackage{bm} + \usepackage{morefloats} + \usepackage{enumitem} + \setlistdepth{10} \let\oldhref\href \renewcommand{\href}[2]{\oldhref{#1}{\hbox{#2}}} - """ + """, } -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). latex_documents = [ - ("index", "WEFE.tex", "WEFE Documentation", "WEFE Team", "manual"), + (master_doc, "WEFE.tex", "WEFE Documentation", "WEFE Team", "manual"), ] -# The name of an image file (relative to this directory) to place at the top of -# the title page. -# latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -# latex_use_parts = True - -# If true, show page references after internal links. -# latex_show_pagerefs = False - -# If true, show URL addresses after external links. -# latex_show_urls = False - -# Documents to append as an appendix to all manuals. 
-# latex_appendices = [] - -# If false, no module index is generated. -# latex_domain_indices = True - -# -- Options for manual page output --------------------------------------- +# -- Options for manual page output ------------------------------------------ -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [("index", "WEFE", "WEFE Documentation", ["The WEFE Team"], 1)] +man_pages = [(master_doc, "wefe", "WEFE Documentation", [author], 1)] -# If true, show URL addresses after external links. -# man_show_urls = False +# -- Options for Texinfo output ---------------------------------------------- -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) texinfo_documents = [ ( - "index", + master_doc, "WEFE", "WEFE Documentation", - "The WEFE Team", + author, "WEFE", - "Word Embedding Fairness Evaluation (WEFE) is an open source library for\ - measuring and mitigating bias in word embedding models.", - "Word Embeddings, Fairness", + "Word Embedding Fairness Evaluation (WEFE) is an open source library for " + "measuring and mitigating bias in word embedding models.", + "Miscellaneous", ), ] -# Documents to append as an appendix to all manuals. -# texinfo_appendices = [] - -# If false, no module index is generated. -# texinfo_domain_indices = True +# -- Options for EPUB output ------------------------------------------------- -# How to display URL addresses: 'footnote', 'no', or 'inline'. -# texinfo_show_urls = 'footnote' +epub_show_urls = "footnote" -# If true, do not generate a @detailmenu in the "Top" node's menu. -# texinfo_no_detailmenu = False +# -- Extension-specific configuration ---------------------------------------- -# Example configuration for intersphinx: refer to the Python standard library. 
-# intersphinx configuration +# Intersphinx configuration for cross-references to other projects intersphinx_mapping = { - "python": (f"https://docs.python.org/{sys.version_info.major}", None), + "python": ("https://docs.python.org/3", None), "numpy": ("https://numpy.org/doc/stable/", None), "scipy": ("https://docs.scipy.org/doc/scipy/", None), "matplotlib": ("https://matplotlib.org/stable", None), "sklearn": ("https://scikit-learn.org/stable/", None), + "pandas": ("https://pandas.pydata.org/docs/", None), + "gensim": ("https://radimrehurek.com/gensim/", None), } -# sphinx-gallery configuration +# Sphinx Gallery configuration sphinx_gallery_conf = { "doc_module": "wefe", - "backreferences_dir": os.path.join("generated"), + "backreferences_dir": "generated", "reference_url": {"wefe": None}, + "examples_dirs": "../examples", + "gallery_dirs": "auto_examples", + "filename_pattern": r"\.py$", + "ignore_pattern": r"__init__\.py", + "download_all_examples": False, + "plot_gallery": "True", } -html_js_files = [ - "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js" -] - -# copybutton conf. -copybutton_only_copy_prompt_lines = True +# Copy button configuration copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " copybutton_prompt_is_regexp = True +copybutton_only_copy_prompt_lines = True + +# TODO extension +todo_include_todos = False + +# Output file base name for HTML help builder +htmlhelp_basename = "wefe" def setup(app) -> None: - # a copy button to copy snippet of code from the documentation + """Sphinx setup hook.""" + # Add custom CSS for theme overrides app.add_css_file("css/theme_overrides.css") diff --git a/docs/sg_execution_times.rst b/docs/sg_execution_times.rst new file mode 100644 index 0000000..78433a9 --- /dev/null +++ b/docs/sg_execution_times.rst @@ -0,0 +1,37 @@ + +:orphan: + +.. 
_sphx_glr_sg_execution_times: + + +Computation times +================= +**00:00.000** total execution time for 0 files **from all galleries**: + +.. container:: + + .. raw:: html + + + + + + + + .. list-table:: + :header-rows: 1 + :class: table table-striped sg-datatable + + * - Example + - Time + - Mem (MB) + * - N/A + - N/A + - N/A diff --git a/examples/Contributing.ipynb b/examples/Contributing.ipynb index cca601c..3fd63cf 100644 --- a/examples/Contributing.ipynb +++ b/examples/Contributing.ipynb @@ -275,7 +275,8 @@ }, "outputs": [], "source": [ - "from typing import Any, Callable, Union\n", + "from collections.abc import Callable\n", + "from typing import Any\n", "\n", "import numpy as np\n", "\n", @@ -298,7 +299,7 @@ " query: Query,\n", " model: WordEmbeddingModel,\n", " lost_vocabulary_threshold: float = 0.2,\n", - " preprocessors: list[dict[str, Union[str, bool, Callable]]] = [{}],\n", + " preprocessors: list[dict[str, str | bool | Callable]] = [{}],\n", " strategy: str = \"first\",\n", " normalize: bool = False,\n", " warn_not_found_words: bool = False,\n", @@ -464,7 +465,8 @@ }, "outputs": [], "source": [ - "from typing import Any, Callable, Union\n", + "from collections.abc import Callable\n", + "from typing import Any\n", "\n", "import numpy as np\n", "from scipy.spatial import distance\n", @@ -530,7 +532,7 @@ " query: Query,\n", " model: WordEmbeddingModel,\n", " lost_vocabulary_threshold: float = 0.2,\n", - " preprocessors: list[dict[str, Union[str, bool, Callable]]] = [{}],\n", + " preprocessors: list[dict[str, str | bool | Callable]] = [{}],\n", " strategy: str = \"first\",\n", " normalize: bool = False,\n", " warn_not_found_words: bool = False,\n", diff --git a/mypy.ini b/mypy.ini index 7d39e3d..99f9d3f 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,4 +1,14 @@ [mypy] -ignore_missing_imports = True -show_column_numbers = True -follow_imports = silent +python_version = 3.10 +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = true 
+disallow_incomplete_defs = true +check_untyped_defs = true +disallow_untyped_decorators = true +no_implicit_optional = true +warn_redundant_casts = true +warn_unused_ignores = true +warn_no_return = true +warn_unreachable = true +strict_equality = true diff --git a/pyproject.toml b/pyproject.toml index 682ed79..e0ceb11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,102 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "wefe" +dynamic = ["version"] +description = "The Word Embedding Fairness Evaluation Framework" +readme = "README.rst" +license = "MIT" +authors = [ + { name = "Pablo Badilla", email = "pablo.badilla@ug.uchile.cl" }, + { name = "Felipe Bravo", email = "fbravo@dcc.uchile.cl" }, + { name = "WEFE Team", email = "fbravo@dcc.uchile.cl" }, +] +maintainers = [{ name = "WEFE Team", email = "fbravo@dcc.uchile.cl" }] +keywords = [ + "bias", + "fairness", + "nlp", + "word embeddings", + "machine learning", + "artificial intelligence", +] +classifiers = [ + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", +] +requires-python = ">=3.10" +dependencies = [ + "gensim>=3.8.3", + "numpy<=1.26.4", + "pandas>=2.0.0", + "plotly>=6.0.0", + "requests>=2.22.0", + "scikit-learn>=1.5.0", + "scipy<1.13", + "semantic_version>=2.8.0", + "tqdm>=4.0.0", +] + +[project.optional-dependencies] +pytorch = ["torch>=2.1.0"] +dev = [ + "docutils>=0.21.2", + "ipython>=8.25.0", + "mypy>=1.17.0", + 
"numpydoc>=1.9.0", + "pytest>=8.4.1", + "pytest-cov>=6.2.1", + "ruff>=0.12.3", + "Sphinx>=8.1.0", + "sphinx-copybutton>=0.5.2", + "sphinx-gallery>=0.19.0", + "sphinx-rtd-theme>=3.0.2", + "torch>=2.1.0", + "urllib3>=2.5.0", +] + +[project.urls] +Homepage = "https://github.com/dccuchile/wefe" +Documentation = "https://wefe.readthedocs.io/" +Repository = "https://github.com/dccuchile/wefe" +"Bug Tracker" = "https://github.com/dccuchile/wefe/issues" + +[tool.setuptools] +include-package-data = true +zip-safe = false + +[tool.setuptools.dynamic] +version = { attr = "wefe.__version__" } + +[tool.setuptools.packages.find] +where = ["."] +include = ["wefe*"] +exclude = ["tests*"] + +[tool.setuptools.package-data] +wefe = ["datasets/data/*"] + +[tool.pytest.ini_options] +addopts = "--doctest-modules --ignore=wefe" +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] + [tool.ruff] exclude = [ @@ -48,3 +147,15 @@ select = [ ] ignore = ["B006"] + +[tool.ruff.lint.per-file-ignores] +"tests/*" = ["D", "S101"] # Ignore docstring and assert rules in tests +"__init__.py" = ["F401"] # Ignore unused imports in __init__.py + + +[tool.ruff.lint.isort] +known-first-party = ["wefe"] +force-sort-within-sections = true + +[tool.ruff.lint.pydocstyle] +convention = "numpy" diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index 56f4102..0000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,13 +0,0 @@ -docutils>=0.21.2 -ipython>=8.25.0 -mypy>=1.17.0 -numpydoc>=1.9.0 -pytest>=8.4.1 -pytest-cov>=6.2.1 -ruff>=0.12.3 -Sphinx>=8.1.0 -sphinx-copybutton>=0.5.2 -sphinx-gallery>=0.19.0 -sphinx-rtd-theme>=3.0.2 -torch>=2.1.0 -urllib3>=2.5.0 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 8fb2c79..0000000 --- a/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -gensim>=3.8.3 -numpy<=1.26.4 -pandas>=2.0.0 -plotly>=6.0.0 -requests>=2.22.0 -scikit-learn>=1.5.0 -scipy<1.13 
-semantic_version>=2.8.0 -tqdm>=4.0.0 diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 55d991d..0000000 --- a/setup.cfg +++ /dev/null @@ -1,8 +0,0 @@ -[metadata] -description-file = README.rst - -[aliases] -test = pytest - -[tool:pytest] -addopts = --doctest-modules diff --git a/setup.py b/setup.py deleted file mode 100644 index df94894..0000000 --- a/setup.py +++ /dev/null @@ -1,73 +0,0 @@ -#! /usr/bin/env python -"""The Word Embeddings Fairness Evaluation Framework.""" - -import codecs - -from setuptools import find_packages, setup - -import wefe - -DISTNAME = "wefe" -DESCRIPTION = "The Word Embedding Fairness Evaluation Framework" -with codecs.open("README.rst", encoding="utf-8-sig") as f: - LONG_DESCRIPTION = f.read() -AUTHOR = "WEFE Team" -MAINTAINER = "WEFE Team" -MAINTAINER_EMAIL = "pablo.badilla@ug.uchile.cl" -URL = "https://github.com/dccuchile/wefe" -LICENSE = "new BSD" -DOWNLOAD_URL = "https://github.com/dccuchile/wefe" -VERSION = wefe.__version__ -INSTALL_REQUIRES = [ - "numpy", - "scipy", - "scikit-learn", - "pandas", - "gensim", - "plotly", - "six", - "requests", - "semantic_version", - "tqdm", -] -CLASSIFIERS = [ - "Intended Audience :: Science/Research", - "Intended Audience :: Developers", - "License :: OSI Approved", - "Programming Language :: Python", - "Topic :: Software Development", - "Topic :: Scientific/Engineering", - "Operating System :: Microsoft :: Windows", - "Operating System :: POSIX", - "Operating System :: Unix", - "Operating System :: MacOS", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", -] -EXTRAS_REQUIRE = { - "pytorch": ["torch"], -} - -setup( - name=DISTNAME, - author=AUTHOR, - maintainer=MAINTAINER, - maintainer_email=MAINTAINER_EMAIL, - description=DESCRIPTION, - license=LICENSE, - url=URL, - version=VERSION, - download_url=DOWNLOAD_URL, - 
long_description=LONG_DESCRIPTION, - long_description_content_type="text/x-rst", - zip_safe=False, # the package can run out of an .egg file - classifiers=CLASSIFIERS, - packages=find_packages(), - install_requires=INSTALL_REQUIRES, - extras_require=EXTRAS_REQUIRE, - python_requires=">=3.6", - include_package_data=True, -) diff --git a/tests/debias/conftest.py b/tests/debias/conftest.py index 8b01a4a..2bd69ae 100644 --- a/tests/debias/conftest.py +++ b/tests/debias/conftest.py @@ -1,7 +1,5 @@ """Test configurations and fixtures.""" -from typing import Union - import numpy as np import pytest @@ -149,7 +147,7 @@ def gender_query_2(weat_wordsets: dict[str, list[str]]) -> Query: @pytest.fixture def gender_query_3( - multiclass_debias_wordsets: dict[str, dict[str, Union[list[str], list]]], + multiclass_debias_wordsets: dict[str, dict[str, list[str] | list]], ) -> Query: """Generate a Male and Female names wrt Career vs Family terms test query. diff --git a/tests/metrics/test_MAC.py b/tests/metrics/test_MAC.py index b8e408e..dd31437 100644 --- a/tests/metrics/test_MAC.py +++ b/tests/metrics/test_MAC.py @@ -34,7 +34,7 @@ def check_MAC_result_values(results: dict[str, Any]) -> None: assert isinstance(attribute_scores, dict) for attribute_name, attribute_score in attribute_scores.items(): assert isinstance(attribute_name, str) - assert isinstance(attribute_score, (np.number, float)) + assert isinstance(attribute_score, np.number | float) def test_MAC(model, query_1t4_1) -> None: diff --git a/tests/metrics/test_RIPA.py b/tests/metrics/test_RIPA.py index 578f92f..8892400 100644 --- a/tests/metrics/test_RIPA.py +++ b/tests/metrics/test_RIPA.py @@ -17,15 +17,15 @@ def check_RIPA_result_values(results: dict[str, Any]) -> None: # note: this checking only applies when the result is not np.nan. 
assert isinstance(results["query_name"], str) - assert isinstance(results["result"], (np.number, float)) - assert isinstance(results["ripa"], (np.number, float)) + assert isinstance(results["result"], np.number | float) + assert isinstance(results["ripa"], np.number | float) assert isinstance(results["word_values"], dict) for word, word_value in results["word_values"].items(): assert isinstance(word, str) assert isinstance(word_value, dict) - assert isinstance(word_value["mean"], (np.number, float)) - assert isinstance(word_value["std"], (np.number, float)) + assert isinstance(word_value["mean"], np.number | float) + assert isinstance(word_value["std"], np.number | float) def test_RIPA(model: WordEmbeddingModel, query_2t1a_1: Query) -> None: diff --git a/tests/metrics/test_RND.py b/tests/metrics/test_RND.py index bcdbb28..b8bf30f 100644 --- a/tests/metrics/test_RND.py +++ b/tests/metrics/test_RND.py @@ -32,7 +32,7 @@ def check_RND_result_values(results: dict[str, Any]) -> None: assert len(distances_by_word) > 0 for word, distance in distances_by_word.items(): assert isinstance(word, str) - assert isinstance(distance, (float, np.number)) + assert isinstance(distance, float | np.number) assert len(word) > 0 diff --git a/tests/metrics/test_WEAT.py b/tests/metrics/test_WEAT.py index d511d08..66e7b26 100644 --- a/tests/metrics/test_WEAT.py +++ b/tests/metrics/test_WEAT.py @@ -31,7 +31,7 @@ def check_WEAT_result_values(results: dict[str, Any]) -> None: assert isinstance(results["effect_size"], np.number) # check p_value options - assert isinstance(results["p_value"], (float, np.number)) or np.isnan( + assert isinstance(results["p_value"], float | np.number) or np.isnan( results["p_value"] ) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 6e51d6d..e30063f 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,4 +1,3 @@ -import socket import urllib.error import pytest @@ -84,7 +83,7 @@ def test_fetch_debiaswe() -> None: assert 
isinstance(set_, list) assert len(set_) > 0 for word in set_: - assert isinstance(word, (str, list)) + assert isinstance(word, str | list) assert len(word) > 0 @@ -123,11 +122,11 @@ def test_fetch_debias_multiclass() -> None: for set_name, set_ in debias_multiclass_dataset.items(): assert isinstance(set_name, str) - assert isinstance(set_, (list, dict)) + assert isinstance(set_, list | dict) if isinstance(set_, list): assert len(set_) > 0 for word in set_: - assert isinstance(word, (str, list)) + assert isinstance(word, str | list) assert len(word) > 0 @@ -253,7 +252,7 @@ def test_retry_request_timeout_error(self, monkeypatch): mock_func = Mock() # First call fails with timeout, second succeeds - mock_func.side_effect = [socket.timeout("Connection timeout"), "success"] + mock_func.side_effect = [TimeoutError("Connection timeout"), "success"] result = _retry_request(mock_func, n_retries=2) diff --git a/tests/test_query.py b/tests/test_query.py index 3b7333d..4ad1abd 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -232,7 +232,7 @@ def test_templates(weat_wordsets: dict[str, list[str]]) -> None: ["Weapons", "Instruments"], ] - for target_name, subquery in zip(target_names, subqueries): + for target_name, subquery in zip(target_names, subqueries, strict=False): assert target_name == subquery.target_sets_names # attribute subqueries @@ -240,7 +240,7 @@ def test_templates(weat_wordsets: dict[str, list[str]]) -> None: attribute_names = [["Pleasant"], ["Unpleasant"]] assert len(subqueries) == 2 - for attribute_name, subquery in zip(attribute_names, subqueries): + for attribute_name, subquery in zip(attribute_names, subqueries, strict=False): assert attribute_name == subquery.attribute_sets_names diff --git a/tests/test_utils.py b/tests/test_utils.py index 5872391..2cd376b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,9 +1,9 @@ import gensim +from gensim.models import KeyedVectors import numpy as np import pandas as pd import pytest import 
semantic_version -from gensim.models import KeyedVectors from wefe.datasets import load_weat from wefe.metrics import RND, WEAT @@ -213,13 +213,13 @@ def check_results_types(results, only_negative=False) -> None: "Male terms and Female terms wrt Math and Arts", ] - for given_col, expected_col in zip(results.columns, expected_cols): + for given_col, expected_col in zip(results.columns, expected_cols, strict=False): assert given_col == expected_col # Check index expected_index = ["dummy_model_1", "dummy_model_2", "dummy_model_3"] - for given_idx, expected_idx in zip(results.index, expected_index): + for given_idx, expected_idx in zip(results.index, expected_index, strict=False): assert given_idx, expected_idx # Check values diff --git a/tests/test_word_embedding_model.py b/tests/test_word_embedding_model.py index 14e6ff7..651d998 100644 --- a/tests/test_word_embedding_model.py +++ b/tests/test_word_embedding_model.py @@ -1,10 +1,10 @@ """Unit tests for the WordEmbeddingModel class from wefe.word_embedding_model.""" import gensim -import numpy as np -import pytest from gensim.models import FastText, KeyedVectors, Word2Vec from gensim.test.utils import common_texts +import numpy as np +import pytest from wefe.word_embedding_model import GENSIM_V4_OR_GREATER, WordEmbeddingModel diff --git a/wefe/_version.py b/wefe/_version.py index 3d26edf..5becc17 100644 --- a/wefe/_version.py +++ b/wefe/_version.py @@ -1 +1 @@ -__version__ = "0.4.1" +__version__ = "1.0.0" diff --git a/wefe/datasets/datasets.py b/wefe/datasets/datasets.py index cb8276b..3df4629 100644 --- a/wefe/datasets/datasets.py +++ b/wefe/datasets/datasets.py @@ -1,16 +1,14 @@ """Module with functions to load datasets and sets of words related to bias.""" +from importlib import resources import json import logging -import socket import time import urllib.error import urllib.request -from typing import Union import numpy as np import pandas as pd -import pkg_resources def _retry_request(func, *args, n_retries: int 
= 3, **kwargs): @@ -44,6 +42,7 @@ def _retry_request(func, *args, n_retries: int = 3, **kwargs): with exponential backoff - Timeout errors (socket.timeout, TimeoutError, OSError) with exponential backoff - Other exceptions with a fixed 1-second delay + """ last_exception = None @@ -67,7 +66,7 @@ def _retry_request(func, *args, n_retries: int = 3, **kwargs): continue # For non-rate-limiting errors, don't retry raise e - except (socket.timeout, TimeoutError, OSError) as e: + except (TimeoutError, OSError) as e: last_exception = e # Handle timeout errors with retry if attempt < n_retries: @@ -256,7 +255,7 @@ def fetch_eds( return word_sets_dict -def fetch_debiaswe(n_retries: int = 3) -> dict[str, Union[list[str], list]]: +def fetch_debiaswe(n_retries: int = 3) -> dict[str, list[str] | list]: """Fetch the word sets used in the paper Man is to Computer Programmer as Woman is to Homemaker? from the source. It includes gender (male, female) terms and related word sets. @@ -345,24 +344,17 @@ def load_bingliu() -> dict[str, list[str]]: A dictionary with the positive and negative words. 
""" - # extract the file - resource_package = __name__ - resource_neg_path = "/".join(("data", "negative-words.txt")) - bingliu_neg_bytes = pkg_resources.resource_stream( - resource_package, resource_neg_path - ) - - resource_pos_path = "/".join(("data", "positive-words.txt")) - bingliu_pos_bytes = pkg_resources.resource_stream( - resource_package, resource_pos_path - ) - - negative_words = [ - word.decode("latin-1").strip() for word in bingliu_neg_bytes.readlines() - ][31:] - positive_words = [ - word.decode("latin-1").strip() for word in bingliu_pos_bytes.readlines() - ][30:] + # Read negative words file + with resources.open_text( + "wefe.datasets.data", "negative-words.txt", encoding="latin-1" + ) as neg_file: + negative_words = [word.strip() for word in neg_file.readlines()][31:] + + # Read positive words file + with resources.open_text( + "wefe.datasets.data", "positive-words.txt", encoding="latin-1" + ) as pos_file: + positive_words = [word.strip() for word in pos_file.readlines()][30:] return { "positive_words": positive_words, @@ -370,7 +362,7 @@ def load_bingliu() -> dict[str, list[str]]: } -def fetch_debias_multiclass(n_retries: int = 3) -> dict[str, Union[list[str], list]]: +def fetch_debias_multiclass(n_retries: int = 3) -> dict[str, list[str] | list]: """Fetch the word sets used in the paper Black Is To Criminals as Caucasian Is To Police: Detecting And Removing Multiclass Bias In Word Embeddings. @@ -590,8 +582,5 @@ def load_weat() -> dict[str, list[str]]: A dictionary with the word sets. 
""" # noqa: D205 - resource_package = __name__ - resource_path = "/".join(("data", "WEAT.json")) - weat_data = pkg_resources.resource_string(resource_package, resource_path) - - return json.loads(weat_data.decode()) + weat_data = resources.read_text("wefe.datasets.data", "WEAT.json") + return json.loads(weat_data) diff --git a/wefe/debias/base_debias.py b/wefe/debias/base_debias.py index d91e3c0..c014fc4 100644 --- a/wefe/debias/base_debias.py +++ b/wefe/debias/base_debias.py @@ -1,12 +1,13 @@ """Contains a base class for implement any debias method in WEFE.""" from abc import abstractmethod -from typing import Optional, Union + +from sklearn.base import BaseEstimator from wefe.word_embedding_model import WordEmbeddingModel -class BaseDebias: +class BaseDebias(BaseEstimator): """Mixin class for implement any debias method in WEFE.""" # The name of the method. @@ -35,8 +36,8 @@ def fit( def transform( self, model: WordEmbeddingModel, - target: Optional[list[str]] = None, - ignore: Optional[list[str]] = None, + target: list[str] | None = None, + ignore: list[str] | None = None, copy: bool = True, ) -> WordEmbeddingModel: """Perform the debiasing method over the model provided. 
@@ -74,8 +75,8 @@ def transform( def fit_transform( self, model: WordEmbeddingModel, - target: Optional[list[str]] = None, - ignore: Optional[list[str]] = None, + target: list[str] | None = None, + ignore: list[str] | None = None, copy: bool = True, **fit_params, ) -> WordEmbeddingModel: @@ -116,8 +117,8 @@ def fit_transform( def _check_transform_args( self, model: WordEmbeddingModel, - target: Optional[list[str]] = None, - ignore: Optional[list[str]] = None, + target: list[str] | None = None, + ignore: list[str] | None = None, copy: bool = True, ) -> None: # check if model is a WordEmbeddingModel @@ -164,7 +165,7 @@ def _check_sets_sizes( self, sets: list[list[str]], set_name: str, - set_size: Union[int, str], + set_size: int | str, ) -> None: if len(sets) == 0: raise ValueError("") diff --git a/wefe/debias/double_hard_debias.py b/wefe/debias/double_hard_debias.py index 2b0a6af..805b0ef 100644 --- a/wefe/debias/double_hard_debias.py +++ b/wefe/debias/double_hard_debias.py @@ -1,8 +1,8 @@ """Double Hard Debias WEFE implementation.""" -import operator from copy import deepcopy -from typing import Any, Optional +import operator +from typing import Any import numpy as np from sklearn.cluster import KMeans @@ -102,7 +102,7 @@ def __init__( self, pca_args: dict[str, Any] = {"n_components": 10}, verbose: bool = False, - criterion_name: Optional[str] = None, + criterion_name: str | None = None, incremental_pca: bool = True, n_words: int = 1000, n_components: int = 4, @@ -208,7 +208,7 @@ def _bias_by_projection( ) words = list(model.vocab.keys()) - similarities = dict(zip(words, similarities_vectors)) + similarities = dict(zip(words, similarities_vectors, strict=False)) for word in ignore: if word in similarities: similarities.pop(word) @@ -360,7 +360,10 @@ def _kmeans_eval( ] kmeans = KMeans(n_cluster).fit(embeddings) y_pred = kmeans.predict(embeddings) - correct = [1 if item1 == item2 else 0 for (item1, item2) in zip(y_true, y_pred)] + correct = [ + 1 if item1 == item2 
else 0 + for (item1, item2) in zip(y_true, y_pred, strict=False) + ] alignment_score = sum(correct) / float(len(correct)) alignment_score = max(alignment_score, 1 - alignment_score) return alignment_score @@ -439,8 +442,8 @@ def fit( def transform( self, model: WordEmbeddingModel, - target: Optional[list[str]] = None, - ignore: Optional[list[str]] = [], + target: list[str] | None = None, + ignore: list[str] | None = [], copy: bool = True, ) -> WordEmbeddingModel: """Execute hard debias over the provided model. diff --git a/wefe/debias/half_sibling_regression.py b/wefe/debias/half_sibling_regression.py index 5e7badb..cdc32c4 100644 --- a/wefe/debias/half_sibling_regression.py +++ b/wefe/debias/half_sibling_regression.py @@ -1,7 +1,6 @@ """Half Sibling Regression WEFE implementation.""" from copy import deepcopy -from typing import Optional import numpy as np from tqdm import tqdm @@ -130,7 +129,7 @@ class HalfSiblingRegression(BaseDebias): def __init__( self, verbose: bool = False, - criterion_name: Optional[str] = None, + criterion_name: str | None = None, ) -> None: """Initialize a Half Sibling Regression Debias instance. @@ -257,8 +256,8 @@ def fit( def transform( self, model: WordEmbeddingModel, - target: Optional[list[str]] = None, - ignore: Optional[list[str]] = None, + target: list[str] | None = None, + ignore: list[str] | None = None, copy: bool = True, ) -> WordEmbeddingModel: """Substracts the gender information from vectors. 
@@ -338,7 +337,7 @@ def transform( bias_info = self.bias_information[:, indexes] vectors = np.asarray(list(self.non_bias_dict.values())).T[:, indexes] debiased_vectors = self._subtract_bias_information(vectors, bias_info).T - self.non_bias_dict = dict(zip(target, debiased_vectors)) + self.non_bias_dict = dict(zip(target, debiased_vectors, strict=False)) # if target and ignores are not provided the debias is applied to # all non bias vectors @@ -347,7 +346,9 @@ def transform( debiased_vectors = self._subtract_bias_information( vectors, self.bias_information ).T - self.non_bias_dict = dict(zip(self.non_bias_dict.keys(), debiased_vectors)) + self.non_bias_dict = dict( + zip(self.non_bias_dict.keys(), debiased_vectors, strict=False) + ) if self.verbose: print("Updating debiased vectors") diff --git a/wefe/debias/hard_debias.py b/wefe/debias/hard_debias.py index 2290957..0ac0192 100644 --- a/wefe/debias/hard_debias.py +++ b/wefe/debias/hard_debias.py @@ -1,8 +1,8 @@ """Hard Debias WEFE implementation.""" -import logging from copy import deepcopy -from typing import Any, Optional +import logging +from typing import Any import numpy as np from sklearn.decomposition import PCA @@ -156,7 +156,7 @@ def __init__( self, pca_args: dict[str, Any] = {"n_components": 10}, verbose: bool = False, - criterion_name: Optional[str] = None, + criterion_name: str | None = None, ) -> None: """Initialize a Hard Debias instance. 
@@ -243,8 +243,8 @@ def _neutralize( self, model: WordEmbeddingModel, bias_direction: np.ndarray, - target: Optional[list[str]], - ignore: Optional[list[str]], + target: list[str] | None, + ignore: list[str] | None, ) -> None: target_ = set(target) if target is not None else set(model.vocab.keys()) @@ -297,7 +297,7 @@ def fit( self, model: WordEmbeddingModel, definitional_pairs: list[list[str]], - equalize_pairs: Optional[list[list[str]]] = None, + equalize_pairs: list[list[str]] | None = None, **fit_params, ) -> BaseDebias: """Compute the bias direction and obtains the equalize embedding pairs. @@ -393,8 +393,8 @@ def fit( def transform( self, model: WordEmbeddingModel, - target: Optional[list[str]] = None, - ignore: Optional[list[str]] = None, + target: list[str] | None = None, + ignore: list[str] | None = None, copy: bool = True, ) -> WordEmbeddingModel: """Execute hard debias over the provided model. diff --git a/wefe/debias/multiclass_hard_debias.py b/wefe/debias/multiclass_hard_debias.py index a3c9317..cfaf451 100644 --- a/wefe/debias/multiclass_hard_debias.py +++ b/wefe/debias/multiclass_hard_debias.py @@ -1,8 +1,8 @@ """Multiclass Hard Debias WEFE implementation.""" -import logging from copy import deepcopy -from typing import Any, Optional +import logging +from typing import Any import numpy as np from sklearn.decomposition import PCA @@ -79,7 +79,7 @@ def __init__( self, pca_args: dict[str, Any] = {"n_components": 10}, verbose: bool = False, - criterion_name: Optional[str] = None, + criterion_name: str | None = None, ) -> None: """Initialize a Multiclass Hard Debias instance. 
@@ -157,8 +157,8 @@ def _neutralize( self, model: WordEmbeddingModel, bias_subspace: np.ndarray, - target: Optional[list[str]], - ignore: Optional[list[str]], + target: list[str] | None, + ignore: list[str] | None, ) -> None: target_ = set(target) if target is not None else set(model.vocab.keys()) @@ -197,7 +197,7 @@ def _equalize( # discard the projection from the mean upsilon = mean - mean_b - for word, embedding in zip(words, embeddings): + for word, embedding in zip(words, embeddings, strict=False): v_b = self._project_onto_subspace(embedding, bias_subspace) frac = (v_b - mean_b) / np.linalg.norm(v_b - mean_b) new_v = upsilon + np.sqrt(1 - np.sum(np.square(upsilon))) * frac @@ -279,8 +279,8 @@ def fit( def transform( self, model: WordEmbeddingModel, - target: Optional[list[str]] = None, - ignore: Optional[list[str]] = None, + target: list[str] | None = None, + ignore: list[str] | None = None, copy: bool = True, ) -> WordEmbeddingModel: """Execute Multiclass Hard Debias over the provided model. 
diff --git a/wefe/debias/repulsion_attraction_neutralization.py b/wefe/debias/repulsion_attraction_neutralization.py index ad7439e..3cb6830 100644 --- a/wefe/debias/repulsion_attraction_neutralization.py +++ b/wefe/debias/repulsion_attraction_neutralization.py @@ -2,7 +2,7 @@ from collections.abc import Sequence from copy import deepcopy -from typing import Any, Optional +from typing import Any import numpy as np from sklearn.decomposition import PCA @@ -276,7 +276,7 @@ def __init__( self, pca_args: dict[str, Any] = {"n_components": 10}, verbose: bool = False, - criterion_name: Optional[str] = None, + criterion_name: str | None = None, epochs: int = 300, theta: float = 0.05, n_neighbours: int = 100, @@ -402,7 +402,7 @@ def _get_neighbours( self, model: WordEmbeddingModel, word: str, n_neighbours: int ) -> list[str]: similar_words = model.wv.most_similar(positive=word, topn=n_neighbours) - similar_words = list(list(zip(*similar_words))[0]) + similar_words = list(list(zip(*similar_words, strict=False))[0]) return similar_words def _get_repulsion_set( @@ -537,8 +537,8 @@ def fit( def transform( self, model: WordEmbeddingModel, - target: Optional[list[str]] = None, - ignore: Optional[list[str]] = [], + target: list[str] | None = None, + ignore: list[str] | None = [], copy: bool = True, ) -> WordEmbeddingModel: """Execute Repulsion Attraction Neutralization Debias over the provided model. 
diff --git a/wefe/metrics/ECT.py b/wefe/metrics/ECT.py index 20d71d3..ec230fe 100644 --- a/wefe/metrics/ECT.py +++ b/wefe/metrics/ECT.py @@ -1,6 +1,7 @@ """Embedding Coherence Test metric implementation.""" -from typing import Any, Callable, Union +from collections.abc import Callable +from typing import Any import numpy as np from scipy.spatial.distance import cosine @@ -52,7 +53,7 @@ def run_query( query: Query, model: WordEmbeddingModel, lost_vocabulary_threshold: float = 0.2, - preprocessors: list[dict[str, Union[str, bool, Callable]]] = [{}], + preprocessors: list[dict[str, str | bool | Callable]] = [{}], strategy: str = "first", normalize: bool = False, warn_not_found_words: bool = False, diff --git a/wefe/metrics/MAC.py b/wefe/metrics/MAC.py index 5004b62..ebad33d 100644 --- a/wefe/metrics/MAC.py +++ b/wefe/metrics/MAC.py @@ -1,6 +1,7 @@ """Mean Average Cosine Similarity (MAC) implementation.""" -from typing import Any, Callable, Union +from collections.abc import Callable +from typing import Any import numpy as np from scipy.spatial import distance @@ -106,7 +107,7 @@ def run_query( query: Query, model: WordEmbeddingModel, lost_vocabulary_threshold: float = 0.2, - preprocessors: list[dict[str, Union[str, bool, Callable]]] = [{}], + preprocessors: list[dict[str, str | bool | Callable]] = [{}], strategy: str = "first", normalize: bool = False, warn_not_found_words: bool = False, diff --git a/wefe/metrics/RIPA.py b/wefe/metrics/RIPA.py index a974784..2f752e3 100644 --- a/wefe/metrics/RIPA.py +++ b/wefe/metrics/RIPA.py @@ -1,6 +1,7 @@ """Relational Inner Product Association Test.""" -from typing import Any, Callable, Union +from collections.abc import Callable +from typing import Any import numpy as np @@ -147,7 +148,7 @@ def run_query( query: Query, model: WordEmbeddingModel, lost_vocabulary_threshold: float = 0.2, - preprocessors: list[dict[str, Union[str, bool, Callable]]] = [{}], + preprocessors: list[dict[str, str | bool | Callable]] = [{}], strategy: str 
= "first", normalize: bool = False, warn_not_found_words: bool = False, diff --git a/wefe/metrics/RND.py b/wefe/metrics/RND.py index 4b6d258..0eb447b 100644 --- a/wefe/metrics/RND.py +++ b/wefe/metrics/RND.py @@ -1,6 +1,7 @@ """Relative Norm Distance (RND) metric implementation.""" -from typing import Any, Callable, Union +from collections.abc import Callable +from typing import Any import numpy as np from sklearn.metrics.pairwise import cosine_similarity @@ -109,7 +110,7 @@ def run_query( model: WordEmbeddingModel, distance: str = "norm", lost_vocabulary_threshold: float = 0.2, - preprocessors: list[dict[str, Union[str, bool, Callable]]] = [{}], + preprocessors: list[dict[str, str | bool | Callable]] = [{}], strategy: str = "first", normalize: bool = False, warn_not_found_words: bool = False, diff --git a/wefe/metrics/RNSB.py b/wefe/metrics/RNSB.py index 2a500ef..2853d5e 100644 --- a/wefe/metrics/RNSB.py +++ b/wefe/metrics/RNSB.py @@ -1,7 +1,8 @@ """Relative Negative Sentiment Bias (RNSB) metric implementation.""" +from collections.abc import Callable import logging -from typing import Any, Callable, Union +from typing import Any import numpy as np import pandas as pd @@ -94,7 +95,7 @@ def _train_classifier( attribute_embeddings_dict: list[dict[str, np.ndarray]], estimator: BaseEstimator, estimator_params: dict[str, Any], - random_state: Union[int, None], + random_state: int | None, holdout: bool, print_model_evaluation: bool, ) -> tuple[BaseEstimator, float]: @@ -270,7 +271,7 @@ def _calc_rnsb( # set the probabilities for each word in a dict. 
negative_sentiment_probabilities = dict( - zip(flatten_target_words, negative_probabilities) + zip(flatten_target_words, negative_probabilities, strict=False) ) return kl_divergence, negative_sentiment_probabilities @@ -282,11 +283,11 @@ def run_query( estimator: BaseEstimator = LogisticRegression, estimator_params: dict[str, Any] = {"solver": "liblinear", "max_iter": 10000}, n_iterations: int = 1, - random_state: Union[int, None] = None, + random_state: int | None = None, holdout: bool = True, print_model_evaluation: bool = False, lost_vocabulary_threshold: float = 0.2, - preprocessors: list[dict[str, Union[str, bool, Callable]]] = [{}], + preprocessors: list[dict[str, str | bool | Callable]] = [{}], strategy: str = "first", normalize: bool = False, warn_not_found_words: bool = False, diff --git a/wefe/metrics/WEAT.py b/wefe/metrics/WEAT.py index 1120999..72b3c20 100644 --- a/wefe/metrics/WEAT.py +++ b/wefe/metrics/WEAT.py @@ -1,8 +1,9 @@ """Word Embedding Assosiation Test (WEAT) metric implementation.""" +from collections.abc import Callable import logging import math -from typing import Any, Callable, Union +from typing import Any import numpy as np from sklearn.metrics.pairwise import cosine_similarity @@ -172,7 +173,7 @@ def _calc_p_value( ) test_function = TEST_FUNCTION_DISPATCHER[test_type] - if not isinstance(iterations, (int, float)): + if not isinstance(iterations, int | float): raise TypeError( f"p value iterations should be int instance, got {iterations}." 
) @@ -252,7 +253,7 @@ def run_query( p_value_iterations: int = 10000, p_value_verbose: bool = False, lost_vocabulary_threshold: float = 0.2, - preprocessors: list[dict[str, Union[str, bool, Callable]]] = [{}], + preprocessors: list[dict[str, str | bool | Callable]] = [{}], strategy: str = "first", normalize: bool = False, warn_not_found_words: bool = False, diff --git a/wefe/metrics/base_metric.py b/wefe/metrics/base_metric.py index 79a0790..7056bdd 100644 --- a/wefe/metrics/base_metric.py +++ b/wefe/metrics/base_metric.py @@ -1,7 +1,8 @@ """Base metric class that all metrics must extend..""" from abc import ABC, abstractmethod -from typing import Any, Callable, ClassVar, Union +from collections.abc import Callable +from typing import Any, ClassVar from wefe.query import Query from wefe.word_embedding_model import WordEmbeddingModel @@ -33,7 +34,7 @@ class BaseMetric(ABC): """ # These attributes MUST be overridden by any class that extends BaseMetric. - metric_template: ClassVar[tuple[Union[int, str], Union[int, str]]] + metric_template: ClassVar[tuple[int | str, int | str]] metric_name: ClassVar[str] metric_short_name: ClassVar[str] diff --git a/wefe/metrics/example_metric.py b/wefe/metrics/example_metric.py index 1a12830..9b015d4 100644 --- a/wefe/metrics/example_metric.py +++ b/wefe/metrics/example_metric.py @@ -1,6 +1,7 @@ """An example of how to implement metrics in WEFE.""" -from typing import Any, Callable, Union +from collections.abc import Callable +from typing import Any import numpy as np from scipy.spatial import distance @@ -75,7 +76,7 @@ def run_query( # any parameter that you need # ..., lost_vocabulary_threshold: float = 0.2, - preprocessors: list[dict[str, Union[str, bool, Callable]]] = [{}], + preprocessors: list[dict[str, str | bool | Callable]] = [{}], strategy: str = "first", normalize: bool = False, warn_not_found_words: bool = False, diff --git a/wefe/preprocessing.py b/wefe/preprocessing.py index 10f904b..4e16e1b 100644 --- 
a/wefe/preprocessing.py +++ b/wefe/preprocessing.py @@ -1,8 +1,7 @@ """Module with utilities that ease the transformation of word sets to embeddings.""" +from collections.abc import Callable, Sequence import logging -from collections.abc import Sequence -from typing import Callable, Optional, Union import numpy as np from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode @@ -16,8 +15,8 @@ def preprocess_word( word: str, - options: dict[str, Union[str, bool, Callable]] = {}, - vocab_prefix: Optional[str] = None, + options: dict[str, str | bool | Callable] = {}, + vocab_prefix: str | None = None, ) -> str: """pre-processes a word before it is searched in the model's vocabulary. @@ -87,7 +86,7 @@ def preprocess_word( def get_embeddings_from_set( model: WordEmbeddingModel, word_set: Sequence[str], - preprocessors: list[dict[str, Union[str, bool, Callable]]] = [{}], + preprocessors: list[dict[str, str | bool | Callable]] = [{}], strategy: str = "first", normalize: bool = False, verbose: bool = False, @@ -166,7 +165,7 @@ def get_embeddings_from_set( if not isinstance(model, WordEmbeddingModel): raise TypeError(f"model should be a WordEmbeddingModel instance, got {model}.") - if not isinstance(word_set, (list, tuple, np.ndarray)): + if not isinstance(word_set, list | tuple | np.ndarray): raise TypeError( f"word_set should be a list, tuple or np.array of strings, got {word_set}." ) @@ -254,7 +253,7 @@ def _check_lost_vocabulary_threshold( word_set_name: str, lost_vocabulary_threshold: float, ) -> None: - if not isinstance(lost_vocabulary_threshold, (float, np.floating)): + if not isinstance(lost_vocabulary_threshold, float | np.floating): raise TypeError( "lost_vocabulary_threshold should be float, " f"got {lost_vocabulary_threshold}." 
@@ -280,8 +279,8 @@ def _check_lost_vocabulary_threshold( def get_embeddings_from_tuples( model: WordEmbeddingModel, sets: Sequence[Sequence[str]], - sets_name: Union[str, None] = None, - preprocessors: list[dict[str, Union[str, bool, Callable]]] = [{}], + sets_name: str | None = None, + preprocessors: list[dict[str, str | bool | Callable]] = [{}], strategy: str = "first", normalize: bool = False, discard_incomplete_sets: bool = True, @@ -366,14 +365,14 @@ def get_embeddings_from_tuples( and as values their associated embeddings. """ - if not isinstance(sets, (list, tuple, np.ndarray)): + if not isinstance(sets, list | tuple | np.ndarray): raise TypeError( "sets should be a sequence of sequences (list, tuple or np.array) " f"of strings, got: {type(sets)}." ) for idx, set_ in enumerate(sets): - if not isinstance(set_, (list, tuple, np.ndarray)): + if not isinstance(set_, list | tuple | np.ndarray): raise TypeError( "Every set in sets should be a list, tuple or np.array of " f"strings, got in index {idx}: {type(set_)}" @@ -445,12 +444,12 @@ def get_embeddings_from_query( model: WordEmbeddingModel, query: Query, lost_vocabulary_threshold: float = 0.2, - preprocessors: list[dict[str, Union[str, bool, Callable]]] = [{}], + preprocessors: list[dict[str, str | bool | Callable]] = [{}], strategy: str = "first", normalize: bool = False, warn_not_found_words: bool = False, verbose: bool = False, -) -> Union[tuple[EmbeddingSets, EmbeddingSets], None]: +) -> tuple[EmbeddingSets, EmbeddingSets] | None: """Obtain the word vectors associated with the provided Query. 
The words that does not appears in the word embedding pretrained model @@ -541,7 +540,9 @@ def get_embeddings_from_query( # -------------------------------------------------------------------- # get target sets embeddings - for target_set, target_set_name in zip(query.target_sets, query.target_sets_names): + for target_set, target_set_name in zip( + query.target_sets, query.target_sets_names, strict=False + ): not_found_words, obtained_embeddings = get_embeddings_from_set( model=model, word_set=target_set, @@ -572,7 +573,7 @@ def get_embeddings_from_query( # -------------------------------------------------------------------- # get attribute sets embeddings for attribute_set, attribute_set_name in zip( - query.attribute_sets, query.attribute_sets_names + query.attribute_sets, query.attribute_sets_names, strict=False ): not_found_words, obtained_embeddings = get_embeddings_from_set( model=model, diff --git a/wefe/query.py b/wefe/query.py index 946a403..67b3cb3 100644 --- a/wefe/query.py +++ b/wefe/query.py @@ -1,7 +1,7 @@ """Module that implements the Query object.""" from itertools import combinations -from typing import Any, Union +from typing import Any import numpy as np @@ -13,8 +13,8 @@ def __init__( self, target_sets: list[Any], attribute_sets: list[Any], - target_sets_names: Union[list[str], None] = None, - attribute_sets_names: Union[list[str], None] = None, + target_sets_names: list[str] | None = None, + attribute_sets_names: list[str] | None = None, ) -> None: """Initializes the container. It could include a name for each word set. @@ -82,12 +82,12 @@ def __init__( """ # check input type - if not isinstance(target_sets, (list, np.ndarray)): + if not isinstance(target_sets, list | np.ndarray): raise TypeError( f"target_sets must be a numpy array or list. 
Given: {type(target_sets)}" ) - if not isinstance(attribute_sets, (list, np.ndarray)): + if not isinstance(attribute_sets, list | np.ndarray): raise TypeError( f"attribute_sets must be a numpy array or list. " f"Given: {type(attribute_sets)}" @@ -102,7 +102,7 @@ def __init__( # check all words that target sets contains. for idx, target_set in enumerate(target_sets): - if not isinstance(target_set, (np.ndarray, list)): + if not isinstance(target_set, np.ndarray | list): raise TypeError( "Each target set must be a list or an array of strings. " f"Given: {type(target_set)} at postion {idx}" @@ -116,7 +116,7 @@ def __init__( # check all words that attribute sets contains. for idx, attribute_set in enumerate(attribute_sets): - if not isinstance(attribute_set, (np.ndarray, list)): + if not isinstance(attribute_set, np.ndarray | list): raise TypeError( "Each attribute set must be a list or an array of strings." f" Given: {type(attribute_set)} at postion {idx}" @@ -205,6 +205,7 @@ def __eq__(self, other: Any) -> bool: for target_set, other_target_set in zip( self.target_sets, other.target_sets, + strict=False, ): if target_set != other_target_set: return False @@ -212,6 +213,7 @@ def __eq__(self, other: Any) -> bool: for attribute_set, other_attribute_set in zip( self.attribute_sets, other.attribute_sets, + strict=False, ): if attribute_set != other_attribute_set: return False @@ -219,6 +221,7 @@ def __eq__(self, other: Any) -> bool: for names, other_names in zip( self.target_sets_names, other.target_sets_names, + strict=False, ): if names != other_names: return False @@ -226,6 +229,7 @@ def __eq__(self, other: Any) -> bool: for names, other_names in zip( self.attribute_sets_names, other.attribute_sets_names, + strict=False, ): if names != other_names: return False @@ -336,11 +340,11 @@ def get_subqueries(self, new_template: tuple) -> list: attribute_subset_name, ) for attribute_subset, attribute_subset_name in zip( - attribute_subsets, attribute_subsets_names + 
attribute_subsets, attribute_subsets_names, strict=False ) ] for target_subset, target_subset_name in zip( - target_subsets, target_subsets_names + target_subsets, target_subsets_names, strict=False ) ] diff --git a/wefe/utils.py b/wefe/utils.py index 20ce012..adb99f0 100644 --- a/wefe/utils.py +++ b/wefe/utils.py @@ -4,16 +4,16 @@ through rankings and graph these results. """ +from collections.abc import Callable import copy +from importlib import resources import logging -from typing import Callable, Union +from gensim.models.keyedvectors import KeyedVectors import numpy as np import pandas as pd -import pkg_resources import plotly.express as px import plotly.graph_objects as go -from gensim.models.keyedvectors import KeyedVectors from sklearn.utils.validation import check_is_fitted as _check_is_fitted from wefe.metrics.base_metric import BaseMetric @@ -104,7 +104,7 @@ def run_queries( metric_params: dict = {}, generate_subqueries: bool = False, aggregate_results: bool = False, - aggregation_function: Union[str, Callable] = "abs_avg", + aggregation_function: str | Callable = "abs_avg", return_only_aggregation: bool = False, warn_not_found_words: bool = False, ) -> pd.DataFrame: @@ -163,7 +163,7 @@ def run_queries( # raise Exception('metric parameter must be instance of BaseMetric') # queries handling - if not isinstance(queries, (list, np.ndarray)): + if not isinstance(queries, list | np.ndarray): raise TypeError( f"queries parameter must be a list or a numpy array. given: {queries}" ) @@ -179,7 +179,7 @@ def run_queries( ) # word vectors wrappers handling - if not isinstance(models, (list, np.ndarray)): + if not isinstance(models, list | np.ndarray): raise TypeError( "word_embeddings_models parameter must be a list or a numpy array." 
f" given: {models}" @@ -588,12 +588,11 @@ def load_test_model() -> WordEmbeddingModel: from gensim.models import KeyedVectors # load dummy weat word vectors: + import wefe.datasets as datasets_package - resource_package = __name__ - resource_path = "/".join(("datasets", "data", "test_model.kv")) - weat_w2v_path = pkg_resources.resource_filename(resource_package, resource_path) - - test_model = KeyedVectors.load(weat_w2v_path) + test_model_path = resources.files(datasets_package) / "data" / "test_model.kv" + with resources.as_file(test_model_path) as weat_w2v_path: + test_model = KeyedVectors.load(str(weat_w2v_path)) return WordEmbeddingModel(test_model, "test_w2v") diff --git a/wefe/word_embedding_model.py b/wefe/word_embedding_model.py index 2050c50..bdb60a7 100644 --- a/wefe/word_embedding_model.py +++ b/wefe/word_embedding_model.py @@ -5,8 +5,8 @@ import gensim import numpy as np -import semantic_version from numpy.typing import NDArray +import semantic_version GENSIM_VERSION = semantic_version.Version.coerce(gensim.__version__) GENSIM_V4_OR_GREATER = GENSIM_VERSION.major >= 4 # type: ignore @@ -383,12 +383,12 @@ def batch_update( """ # Initial type and length validation for the input containers - if not isinstance(words, (list, tuple, np.ndarray)): + if not isinstance(words, list | tuple | np.ndarray): raise TypeError( f"words argument should be a list, tuple or np.array of strings, " f"but got {type(words)}." ) - if not isinstance(embeddings, (list, tuple, np.ndarray)): + if not isinstance(embeddings, list | tuple | np.ndarray): raise TypeError( "embeddings argument should be a list, tuple or np.array of " f"NumPy arrays, but got {type(embeddings)}."