diff --git a/.github/workflows/pkg_index.yml b/.github/workflows/pkg_index.yml new file mode 100644 index 0000000..7955b98 --- /dev/null +++ b/.github/workflows/pkg_index.yml @@ -0,0 +1,40 @@ +name: Build and commit index.xml on package update + +on: + push: + branches: + - gh-pages + paths: + - 'packages/**' + +jobs: + build-index: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install dependencies + run: pip install nltk + + - name: Install make + run: sudo apt-get update && sudo apt-get install -y make + + - name: Build index.xml + run: make pkg_index + + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Commit and push index.xml + run: | + git add index.xml + git commit -m "Auto-build index.xml after package update" || echo "No changes to commit" + git push \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..67aec2a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,81 @@ +# Contributing to nltk_data + +Thank you for your interest in contributing to [`nltk_data`](https://github.com/nltk/nltk_data)! This guide will help you add new data packages (corpora, taggers, models, etc.) and contribute improvements to existing ones. + +## Adding a New Data Package + +The `nltk_data` repository contains datasets and resources that can be downloaded by `nltk.downloader`. To add a new dataset or resource, please follow these steps: + +### 1. Fork and Clone the Repository + +First, fork the [`nltk_data`](https://github.com/nltk/nltk_data) repository to your own GitHub account. For help with forking, see the [GitHub documentation on forking a repository](https://docs.github.com/en/get-started/quickstart/fork-a-repo). 
+ +Then, clone your fork locally (replace `YOUR-USERNAME` with your GitHub username): + +```bash +git clone https://github.com/YOUR-USERNAME/nltk_data.git +cd nltk_data +``` + +### 2. Create a New Branch + +Create a branch for your dataset: + +```bash +git checkout -b add-my-dataset +``` + +### 3. Add Your Data Package + +- Place your dataset in the appropriate directory under `packages/` (`packages/corpora/`, `packages/models/`, `packages/tokenizers/`, etc.). If you are unsure, check the existing structure or open an issue for clarification. +- If your dataset has a license, include the license file in the same directory. If the license is unknown or separate from the repository, please add a note in a `README` or `LICENSE` file within the dataset’s folder, and document this in your pull request. + +**Whenever you add a new data package, you must update [`DATASET-LICENSES.md`](DATASET-LICENSES.md) with the license information for your package.** + +You only need to update [`LICENSE-OVERVIEW.md`](LICENSE-OVERVIEW.md) if you are making changes to the repository’s overall licensing structure or guidance. + +### 4. Update Index Files + +- You do **not** need to manually update `index.xml`. This file is now rebuilt automatically by a GitHub Actions workflow after your changes are merged. +- Any local changes you make to `index.xml` will be ignored and overwritten by the workflow. +- Provide a short README or metadata file describing the package, its origin, and its license. + +### 5. Commit and Push Your Changes + +```bash +git add packages/ DATASET-LICENSES.md +git commit -m "Add dataset to nltk_data" +git push origin add-my-dataset +``` + +### 6. Create a Pull Request + +Open a pull request from your branch to the `gh-pages` branch of `nltk/nltk_data` (the branch that holds the data packages and on which the index-building workflow runs). For help, see the [GitHub documentation on creating a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request). + +In your pull request, please include: +- A description of the dataset and its purpose. 
+- Any relevant licensing information or restrictions. +- Instructions for any special installation or usage requirements. + +### 7. Respond to Feedback + +- Be responsive to comments and requested changes. +- If your dataset cannot be accepted (e.g., due to licensing issues), we will let you know in the pull request. + +## General Guidelines + +- **Licensing**: Please ensure you have the right to redistribute any data you submit, and document the license clearly. If the license is unknown, state this explicitly in your pull request. +- **No Large Files**: If your package is extremely large, consider hosting it elsewhere and providing an index/manifest, or open an issue to discuss options. +- **No Executable Files**: Only data, not code, should be included unless a script is essential for using the dataset. + +## Additional Resources + +- [GitHub Docs: Fork a repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo) +- [GitHub Docs: Branches](https://docs.github.com/en/get-started/quickstart/github-glossary#branch) +- [GitHub Docs: Pull Requests](https://docs.github.com/en/pull-requests) + +If you have questions or need help, please open an issue or join the [nltk-dev mailing list](https://groups.google.com/forum/#!forum/nltk-dev). + +--- + +Thank you for helping improve NLTK’s data resources! \ No newline at end of file diff --git a/DATASET-LICENSES.md b/DATASET-LICENSES.md new file mode 100644 index 0000000..978a24b --- /dev/null +++ b/DATASET-LICENSES.md @@ -0,0 +1,243 @@ +# DATASET-LICENSES.md + +This document provides a grouped summary of licenses for all data packages present in the [`nltk_data`](https://github.com/nltk/nltk_data) repository, based on the current `index.xml` file. Each package is listed by its exact `id` and `name`, and grouped by license type as declared in the metadata. + +> **Disclaimer:** +> This information is provided as a convenience to users and is not legal advice. 
+> **You must verify the license for each dataset with the original source if your use case is sensitive (especially for commercial or redistributive use).** +> Licenses or terms can change over time; this file may become outdated if not maintained. + +--- + +## MIT License + +- averaged_perceptron_tagger — Averaged Perceptron Tagger +- averaged_perceptron_tagger_eng — Averaged Perceptron Tagger (JSON) +- averaged_perceptron_tagger_ru — Averaged Perceptron Tagger (Russian) +- averaged_perceptron_tagger_rus — Averaged Perceptron Tagger (Russian) +- vader_lexicon — VADER Sentiment Lexicon + +--- + +## Creative Commons Licenses + +### Creative Commons Attribution 4.0 International + +- opinion_lexicon — Opinion Lexicon +- product_reviews_1 — Product Reviews (5 Products) +- product_reviews_2 — Product Reviews (9 Products) +- pros_cons — Pros and Cons +- subjectivity — Subjectivity Dataset v1.0 + +### Creative Commons Attribution 3.0 Unported License + +- framenet_v17 — FrameNet 1.7 + +### Creative Commons Attribution-NonCommercial-ShareAlike 3.0 United States + +- universal_treebanks_v20 — Universal Treebanks Version 2.0 + +### Creative Commons Attribution 3.0 (unspecified region) + +- sentiwordnet — SentiWordNet + +### CC0 1.0 Universal + +- panlex_swadesh — PanLex Swadesh Corpora + +### CC By SA 3.0 (Wiktionary) & UBY 1.0 (UBY) + +- extended_omw — Extended Open Multilingual WordNet + +--- + +## GNU Licenses + +### GNU General Public License + +- pl196x — Polish language of the XX century sixties + +### GNU Free Documentation License + +- swadesh — Swadesh Wordlists +- gazetteers — Gazetteer Lists (note: for some files only; others may be public domain) + +### GNU Lesser General Public License + +- nonbreaking_prefixes — Non-Breaking Prefixes (Moses Decoder) + +--- + +## Public Domain + +- genesis — Genesis Corpus +- gutenberg — Project Gutenberg Selections +- inaugural — C-Span Inaugural Address Corpus +- shakespeare — Shakespeare XML Corpus Sample +- udhr — 
Universal Declaration of Human Rights Corpus +- udhr2 — Universal Declaration of Human Rights Corpus (Unicode Version) +- words — Word Lists + +--- + +## “Distributed with Permission” / “May be used with Permission” / “Freely Redistributable” + +> **Warning:** +> These are not standard open licenses. Terms may prohibit redistribution, modification, or commercial use. +> **You must consult the upstream source for the actual terms and whether permission applies to your use case.** + +- alpino — Alpino Dutch Treebank +- indian — Indian Language POS-Tagged Corpus +- lin_thesaurus — Lin's Dependency Thesaurus +- mac_morpho — MAC-MORPHO: Brazilian Portuguese news text with part-of-speech tags +- paradigms — Paradigm Corpus +- nombank.1.0 — NomBank Corpus 1.0 +- propbank — Proposition Bank Corpus 1.0 +- senseval — SENSEVAL 2 Corpus: Sense Tagged Text +- verbnet — VerbNet Lexicon, Version 2.1 +- verbnet3 — VerbNet Lexicon, Version 3.3 +- maxent_treebank_pos_tagger — Treebank Part of Speech Tagger (Maximum entropy) +- maxent_treebank_pos_tagger_tab — Treebank Part of Speech Tagger (Maximum entropy) +- maxent_ne_chunker — ACE Named Entity Chunker (Maximum entropy) +- maxent_ne_chunker_tab — ACE Named Entity Chunker (Maximum entropy) +- pil — The Patient Information Leaflet (PIL) Corpus +- pe08 — Cross-Framework and Cross-Domain Parser Evaluation Shared Task +- kimmo — PC-KIMMO Data Files +- jeita — JEITA Public Morphologically Tagged Corpus +- knbc — KNB Corpus (Annotated blog corpus) + +--- + +## “Non-commercial Use Only” / Educational Use + +- brown — Brown Corpus +- brown_tei — Brown Corpus (TEI XML Version) +- framenet_v15 — FrameNet 1.5 +- floresta — Portuguese Treebank +- masc_tagged — MASC Tagged Corpus +- nps_chat — NPS Chat + +--- + +## “See LICENSE Files” (Aggregated/Mixed Licensing) + +> **Warning:** +> These packages include files from multiple sources, each with their own license. See LICENSE files inside the package and verify terms for your use case. 
+ +- omw — Open Multilingual Wordnet +- omw-1.4 — Open Multilingual Wordnet + +--- + +## Special Cases, Custom, or Unique Licenses + +- bcp47 — BCP-47 Language Tags ("IETF Trust and Unicode Inc."; custom) +- wordnet — WordNet ("Permission to use, copy, modify and distribute this software and database and its documentation for any purpose and without fee or royalty") +- wordnet31 — Wordnet 3.1 (same as above) +- wordnet2021 / wordnet2022 / english_wordnet — Open English Wordnet (combines WordNet License and Creative Commons Attribution) +- twitter_samples — Twitter Samples ("Must be used subject to Twitter Developer Agreement") +- switchboard — Switchboard Corpus Sample ("Permission is granted for use of this material in accordance with the Open Content License") +- dependency_treebank — Dependency Parsed Treebank (fragment of Penn Treebank; non-commercial, no redistribution) +- ptb — Penn Treebank (stub for full corpus) +- treebank — Penn Treebank Sample (fragment; non-commercial, no redistribution) +- conll2000 — CONLL 2000 Chunking Corpus (research use only) +- conll2002 — CONLL 2002 Named Entity Recognition Corpus (see website) +- conll2007 — Dependency Treebanks from CoNLL 2007 (Catalan and Basque Subset, see website) +- ieer — NIST IE-ER DATA SAMPLE (see website) +- reuters — Reuters-21578 benchmark corpus, ApteMod version (Reuters Ltd. copyright) +- timit — TIMIT Corpus Sample (Creative Commons Attribution-NonCommercial-ShareAlike 3.0) + +--- + +## Unclarified, Unknown, Ambiguous, or Citation-Only + +The following packages have: +- No `license` attribute +- An empty or ambiguous value +- A citation request instead of a license +- Or otherwise ambiguous status + +> **Warning:** +> These packages lack open, standard, or clearly documented licenses. +> Citation requests do **not** constitute a license. 
+> Despite long-standing and ongoing efforts (see [nltk_data issue #241](https://github.com/nltk/nltk_data/issues/241) and related discussions), clarification has not been possible for these cases. +> **If you need to use any of these for commercial or redistributive purposes, consult a qualified legal professional.** + +- abc — Australian Broadcasting Commission 2006 +- basque_grammars — Grammars for Basque +- biocreative_ppi — BioCreAtIvE (Critical Assessment of Information Extraction Systems in Biology) +- bllip_wsj_no_aux — BLLIP Parser: WSJ Model +- book_grammars — Grammars from NLTK Book +- cess_cat — CESS-CAT Treebank (citation requested, not a license) +- cess_esp — CESS-ESP Treebank (citation requested, not a license) +- chat80 — Chat-80 Data Files +- city_database — City Database +- cmudict — The Carnegie Mellon Pronouncing Dictionary (0.6) +- comparative_sentences — Comparative Sentence Dataset (ambiguous license) +- comtrans — ComTrans Corpus Sample +- dolch — Dolch Word List +- europarl_raw — Sample European Parliament Proceedings Parallel Corpus +- framenet_v15 — FrameNet 1.5 (non-commercial use only) +- gazetteers — Gazetteer Lists (mixed per-file) +- large_grammars — Large context-free and feature-based grammars +- machado — Machado de Assis -- Obra Completa ("Public Domain", verify at source) +- moses_sample — Moses Sample Models +- mwa_ppdb — Monolingual word aligner (subset of Paraphrase Database) +- names — Names Corpus, Version 1.3 (1994-03-29) +- nonbreaking_prefixes — Non-Breaking Prefixes (empty license field) +- punkt — Punkt Tokenizer Models (no license attribute) +- punkt_tab — Punkt Tokenizer Models (no license attribute) +- porter_test — Porter Stemmer Test Files +- ppattach — Prepositional Phrase Attachment Corpus +- problem_reports — Problem Report Corpus +- qc — Experimental Data for Question Classification +- rslp — RSLP Stemmer (Removedor de Sufixos da Lingua Portuguesa) +- rte — PASCAL RTE Challenges 1, 2, and 3 +- sample_grammars 
— Sample Grammars +- semcor — SemCor 3.0 +- sentence_polarity — Sentence Polarity Dataset v1.0 (ambiguous license) +- smultron — SMULTRON Corpus Sample +- snowball_data — Snowball Data +- spanish_grammars — Grammars for Spanish +- state_union — C-Span State of the Union Address Corpus +- stopwords — Stopwords Corpus +- tagsets — Help on Tagsets +- tagsets_json — Help on Tagsets (JSON) +- toolbox — Toolbox Sample Files +- unicode_samples — Unicode Samples +- webtext — Web Text Corpus +- wmt15_eval — Evaluation data from WMT15 +- word2vec_sample — Word2Vec Sample +- wordnet_ic — WordNet-InfoContent +- ycoe — York-Toronto-Helsinki Parsed Corpus of Old English Prose + +--- + +## Packages with Citation Requests Instead of Licenses + +> **Note:** +> These packages specifically request citation for use, but do not provide a license. Citation requests are not a license. + +- cess_cat — CESS-CAT Treebank +- cess_esp — CESS-ESP Treebank + +--- + +## Packages Citing Source Website or “See Website” for Terms + +> **Note:** +> These packages refer users to an external website for their licensing terms. + +- conll2002 — CONLL 2002 Named Entity Recognition Corpus +- conll2007 — Dependency Treebanks from CoNLL 2007 (Catalan and Basque Subset) +- ieer — NIST IE-ER DATA SAMPLE +- reuters — The Reuters-21578 benchmark corpus, ApteMod version + +--- + +## Maintenance + +**If you add, update, or remove any data packages, update this file accordingly to ensure continued transparency and compliance.** +If you find omissions, errors, or outdated information, please open an issue or pull request. + +--- \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..cdec062 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for describing the origin of the Work and + reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/LICENSE-OVERVIEW.md b/LICENSE-OVERVIEW.md new file mode 100644 index 0000000..30d163d --- /dev/null +++ b/LICENSE-OVERVIEW.md @@ -0,0 +1,50 @@ +# LICENSE-OVERVIEW.md + +## Overview + +This repository (`nltk_data`) is governed as a whole by the [Apache License 2.0](LICENSE). However, **the individual data packages included in this repository are each subject to their own licenses**, which may differ substantially from the repository-wide license. Packages may be covered by open licenses (MIT, Creative Commons, etc.), public domain dedication, custom or restrictive terms (such as "non-commercial use only" or "distributed with permission"), or may lack explicit license terms entirely. 
+ +> **Important:** +> You must consult the specific license for each dataset before use, especially for commercial or redistributive purposes. +> See [DATASET-LICENSES.md](DATASET-LICENSES.md) for a grouped summary of package licenses. + +Maintainers are not legal professionals and cannot answer legal questions or provide legal advice. +If you have any doubts or require legal interpretation, **consult a qualified legal professional**. + +## Special Notes + +- **Unclarified, Ambiguous, or Missing Licenses** + Some data packages have ambiguous, missing, or unclarified licenses (most notably the Punkt Tokenizer Models). Despite long-standing community efforts (see [nltk_data issue #241](https://github.com/nltk/nltk_data/issues/241) and related issues), clarification has not always been possible. + These packages are grouped and flagged in [DATASET-LICENSES.md](DATASET-LICENSES.md) with explicit warnings. + If you have legal questions or concerns about using any package with an unclear or ambiguous license, consult a qualified lawyer. Do not rely on assumptions, community answers, or advice from maintainers. + +- **This Documentation is Not Legal Advice** + The information in these files is provided for convenience and transparency, and does not constitute legal advice. + You are responsible for ensuring your own legal compliance when using, modifying, or redistributing any content from this repository. + +## Data Package Licenses + +Each data package may have its own license, as detailed in [DATASET-LICENSES.md](DATASET-LICENSES.md). These may include (but are not limited to): +- Open source licenses (MIT, various Creative Commons, GPL, etc.) +- Public domain dedication +- Custom or restrictive terms ("distributed with permission", "non-commercial use only", "see website", etc.) 
+- Citation requests (note: a citation request does not constitute a license) +- No license or ambiguous terms + +If a license is unclear, missing, or does not suit your intended use, **do not assume that commercial or public redistribution is allowed**. + +## Your Responsibilities + +- **Check the Dataset License:** + Before using, modifying, or redistributing any data package, check the relevant license entry in [DATASET-LICENSES.md](DATASET-LICENSES.md) and, if necessary, consult the original data source for updated terms. + +- **When in Doubt:** + If the license is missing, ambiguous, or unclear, or if you are unsure about your intended use, seek advice from a qualified legal professional. + +## Keeping This Documentation Up to Date + +If you add, update, or remove datasets, please also update [DATASET-LICENSES.md](DATASET-LICENSES.md) and this overview file to ensure continued transparency for all users. + +## Apache License 2.0 + +See the [LICENSE](LICENSE) file for the full text of the repository-wide license. \ No newline at end of file diff --git a/Makefile b/Makefile index 8a4520b..35292f4 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,10 @@ -PYTHON = python -BASEURL = http://www.nltk.org/nltk_data/packages +PYTHON = python3 +BASEURL = https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages pkg_index: + $(PYTHON) tools/build_collections.py . $(PYTHON) tools/build_pkg_index.py . $(BASEURL) index.xml - git add index.xml - git commit -m "updated data index" index.xml + git add index.xml collections grammars: git commit -m "updated grammar files" packages/grammars diff --git a/README.md b/README.md new file mode 100644 index 0000000..f1f4551 --- /dev/null +++ b/README.md @@ -0,0 +1,35 @@ +# Data Distribution for NLTK + +This repository contains data packages (corpora, models, tokenizers, etc.) for use with [NLTK](https://www.nltk.org/). 
+ +## Installation + +To install data using the NLTK downloader, run: + +```python +import nltk +nltk.download() +``` + +For detailed instructions, please see the [NLTK website](https://www.nltk.org/). + +--- + +## Recent Enhancements + +> **Note:** You do not need to update `index.xml` when adding or modifying packages. It is automatically rebuilt after changes are merged. + +### Licensing Transparency ([PR #242](https://github.com/nltk/nltk_data/pull/242)) +- Added a top-level `LICENSE` (Apache License 2.0) for the repository. +- Added `LICENSE-OVERVIEW.md` summarizing the licensing structure, with emphasis on the diversity of dataset licenses and the importance of reviewing individual terms. +- Added `DATASET-LICENSES.md` — a comprehensive, grouped list of all data packages and their licenses, highlighting any ambiguous or unclarified licensing. +- These changes improve transparency, support responsible use, and aid compliance for all users. + +### Contribution Guidelines +- Introduced a detailed `CONTRIBUTING.md` with step-by-step instructions for adding a new data package using Git and GitHub. +- Please see `CONTRIBUTING.md` for instructions on adding datasets and making other contributions. +- Contributors are encouraged to clarify dataset licenses and to consult the new licensing overview and dataset license table. + +--- + +*For instructions on adding new data packages, please see [CONTRIBUTING.md](CONTRIBUTING.md). 
For licensing details, see [LICENSE-OVERVIEW.md](LICENSE-OVERVIEW.md) and [DATASET-LICENSES.md](DATASET-LICENSES.md).* \ No newline at end of file diff --git a/README.txt b/README.txt deleted file mode 100644 index 0a1b3f0..0000000 --- a/README.txt +++ /dev/null @@ -1,7 +0,0 @@ -Data Distribution for NLTK - -Install using NLTK downloader: nltk.download() - -For instructions please see http://www.nltk.org/ - - diff --git a/collections/all-corpora.xml b/collections/all-corpora.xml index 4c3bc55..055d975 100644 --- a/collections/all-corpora.xml +++ b/collections/all-corpora.xml @@ -1,74 +1,95 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/collections/all-nltk.xml b/collections/all-nltk.xml new file mode 100644 index 0000000..b863edb --- /dev/null +++ b/collections/all-nltk.xml @@ -0,0 +1,123 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/collections/all.xml b/collections/all.xml index c194275..52ce7a7 100644 --- a/collections/all.xml +++ b/collections/all.xml @@ -1,100 +1,123 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + \ No newline at end of file diff --git a/collections/book.xml b/collections/book.xml index c322f11..364affe 100644 --- a/collections/book.xml +++ b/collections/book.xml @@ -37,4 +37,5 @@ + diff --git a/collections/popular.xml b/collections/popular.xml new file mode 100644 index 0000000..3a736dd --- /dev/null +++ b/collections/popular.xml @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/collections/tests.xml b/collections/tests.xml new file mode 100644 index 0000000..d1509b8 --- /dev/null +++ b/collections/tests.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/collections/third-party.xml b/collections/third-party.xml new file mode 100644 index 0000000..0cb74d8 --- /dev/null +++ b/collections/third-party.xml @@ -0,0 +1,3 @@ + + + diff --git a/index.xml b/index.xml index afde054..20ea2b3 100644 --- a/index.xml +++ b/index.xml @@ -2,110 +2,141 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + @@ -113,14 +144,20 @@ + + + + + + @@ -130,54 +167,91 @@ + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + + @@ -193,9 +267,13 @@ + + + + @@ -209,32 +287,35 @@ - + + + - + + + - - - + + - + @@ -252,28 +333,138 @@ + + + + - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + - - + + + + + - - + + + + + + + + + + + + + + + - - + + + + + + + @@ -314,6 +505,55 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + - \ No newline at end of file + diff --git a/packages/chunkers/maxent_ne_chunker_tab.xml b/packages/chunkers/maxent_ne_chunker_tab.xml new file mode 100644 index 0000000..14f7d2d --- /dev/null +++ b/packages/chunkers/maxent_ne_chunker_tab.xml @@ -0,0 +1,5 @@ + diff --git a/packages/chunkers/maxent_ne_chunker_tab.zip b/packages/chunkers/maxent_ne_chunker_tab.zip new file mode 100644 index 0000000..1dfcfa0 Binary files /dev/null and b/packages/chunkers/maxent_ne_chunker_tab.zip differ diff --git a/packages/chunkers/muc6.chunk.tagger.pickle.gz b/packages/chunkers/muc6.chunk.tagger.pickle.gz deleted file mode 100644 index 60c2b9a..0000000 Binary files a/packages/chunkers/muc6.chunk.tagger.pickle.gz and /dev/null differ diff --git a/packages/chunkers/treebank.chunker.pickle.gz b/packages/chunkers/treebank.chunker.pickle.gz deleted file mode 100644 index b39b2ee..0000000 Binary files a/packages/chunkers/treebank.chunker.pickle.gz and /dev/null differ diff --git a/packages/corpora/alpino.xml b/packages/corpora/alpino.xml index 9af207b..0679e79 100644 --- a/packages/corpora/alpino.xml +++ b/packages/corpora/alpino.xml @@ -1,6 +1,6 @@ diff --git a/packages/corpora/alpino.zip b/packages/corpora/alpino.zip index 769873e..7c7cd2f 100644 Binary files a/packages/corpora/alpino.zip and b/packages/corpora/alpino.zip differ diff --git a/packages/corpora/bcp47.xml b/packages/corpora/bcp47.xml new file mode 100644 index 0000000..96e2fed --- /dev/null +++ b/packages/corpora/bcp47.xml @@ -0,0 +1,6 @@ + diff --git a/packages/corpora/bcp47.zip b/packages/corpora/bcp47.zip new file mode 100644 index 0000000..516a294 Binary files /dev/null and b/packages/corpora/bcp47.zip differ diff --git a/packages/corpora/dolch.xml b/packages/corpora/dolch.xml new file mode 100644 index 0000000..0968e8b --- /dev/null +++ b/packages/corpora/dolch.xml @@ -0,0 +1,4 @@ + diff --git a/packages/corpora/dolch.zip b/packages/corpora/dolch.zip new file mode 100644 index 0000000..940e75f Binary files 
/dev/null and b/packages/corpora/dolch.zip differ diff --git a/packages/corpora/english_wordnet.xml b/packages/corpora/english_wordnet.xml new file mode 100644 index 0000000..3866213 --- /dev/null +++ b/packages/corpora/english_wordnet.xml @@ -0,0 +1,7 @@ + diff --git a/packages/corpora/english_wordnet.zip b/packages/corpora/english_wordnet.zip new file mode 100644 index 0000000..b8e583a Binary files /dev/null and b/packages/corpora/english_wordnet.zip differ diff --git a/packages/corpora/extended_omw.xml b/packages/corpora/extended_omw.xml new file mode 100644 index 0000000..b908230 --- /dev/null +++ b/packages/corpora/extended_omw.xml @@ -0,0 +1,6 @@ + diff --git a/packages/corpora/extended_omw.zip b/packages/corpora/extended_omw.zip new file mode 100644 index 0000000..4648909 Binary files /dev/null and b/packages/corpora/extended_omw.zip differ diff --git a/packages/corpora/framenet_v17.xml b/packages/corpora/framenet_v17.xml new file mode 100644 index 0000000..92fada7 --- /dev/null +++ b/packages/corpora/framenet_v17.xml @@ -0,0 +1,6 @@ + diff --git a/packages/corpora/framenet_v17.zip b/packages/corpora/framenet_v17.zip new file mode 100644 index 0000000..f7a5670 Binary files /dev/null and b/packages/corpora/framenet_v17.zip differ diff --git a/packages/corpora/gutenberg.xml b/packages/corpora/gutenberg.xml index ae9906a..439c04f 100644 --- a/packages/corpora/gutenberg.xml +++ b/packages/corpora/gutenberg.xml @@ -1,5 +1,5 @@ diff --git a/packages/corpora/mock_corpus.zip b/packages/corpora/mock_corpus.zip new file mode 100644 index 0000000..9be98e8 Binary files /dev/null and b/packages/corpora/mock_corpus.zip differ diff --git a/packages/corpora/mte_teip5.xml b/packages/corpora/mte_teip5.xml index aaba382..1715b6b 100644 --- a/packages/corpora/mte_teip5.xml +++ b/packages/corpora/mte_teip5.xml @@ -2,5 +2,5 @@ author="Erjavec, Tomaž; Barbu, Ana-Maria; Derzhanski, Ivan; Dimitrova, Ludmila; Garabík, Radovan; Ide, Nancy; Kaalep, Heiki-Jaan; Kotsyba, Natalia; Krstev, 
Cvetana; Oravecz, Csaba; Petkevič, Vladimír; Priest-Dorman, Greg; QasemiZadeh, Behrang; Radziszewski, Adam; Simov, Kiril; Tufiş, Dan and Zdravkova, Katerina" license="Creative Commons - Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)" webpage="https://www.clarin.si/repository/xmlui/handle/11356/1043" - unzip="0" + unzip="1" /> diff --git a/packages/corpora/nombank.1.0.zip b/packages/corpora/nombank.1.0.zip index b756bb5..f44d852 100644 Binary files a/packages/corpora/nombank.1.0.zip and b/packages/corpora/nombank.1.0.zip differ diff --git a/packages/corpora/nonbreaking_prefixes.xml b/packages/corpora/nonbreaking_prefixes.xml new file mode 100644 index 0000000..57d71bd --- /dev/null +++ b/packages/corpora/nonbreaking_prefixes.xml @@ -0,0 +1,5 @@ + diff --git a/packages/corpora/nonbreaking_prefixes.zip b/packages/corpora/nonbreaking_prefixes.zip new file mode 100644 index 0000000..d9d21e5 Binary files /dev/null and b/packages/corpora/nonbreaking_prefixes.zip differ diff --git a/packages/corpora/oanc_masc.xml b/packages/corpora/oanc_masc.xml deleted file mode 100644 index 20871f2..0000000 --- a/packages/corpora/oanc_masc.xml +++ /dev/null @@ -1,4 +0,0 @@ - diff --git a/packages/corpora/omw-1.4.xml b/packages/corpora/omw-1.4.xml new file mode 100644 index 0000000..64ea793 --- /dev/null +++ b/packages/corpora/omw-1.4.xml @@ -0,0 +1,7 @@ + diff --git a/packages/corpora/omw-1.4.zip b/packages/corpora/omw-1.4.zip new file mode 100644 index 0000000..2bece32 Binary files /dev/null and b/packages/corpora/omw-1.4.zip differ diff --git a/packages/corpora/omw.xml b/packages/corpora/omw.xml index e5e30ee..c991a80 100755 --- a/packages/corpora/omw.xml +++ b/packages/corpora/omw.xml @@ -2,6 +2,6 @@ author="Francis Bond" license="Please consult the LICENSE files included with the individual Wordnets. Note that all permit redistribution." 
copyright="Please consult the copyright statements of the individual Wordnets" - webpage="http://compling.hss.ntu.edu.sg/omw/" - unzip="1" + webpage="https://omwn.org/" + unzip="0" /> diff --git a/packages/corpora/omw.zip b/packages/corpora/omw.zip index e05c754..b28c4ed 100644 Binary files a/packages/corpora/omw.zip and b/packages/corpora/omw.zip differ diff --git a/packages/corpora/panlex_swadesh.zip b/packages/corpora/panlex_swadesh.zip index 6937900..9e2e117 100644 Binary files a/packages/corpora/panlex_swadesh.zip and b/packages/corpora/panlex_swadesh.zip differ diff --git a/packages/corpora/ptb3.zip b/packages/corpora/ptb3.zip deleted file mode 100755 index a31181e..0000000 Binary files a/packages/corpora/ptb3.zip and /dev/null differ diff --git a/packages/corpora/sinica_treebank.zip b/packages/corpora/sinica_treebank.zip index c7d0107..27be13e 100644 Binary files a/packages/corpora/sinica_treebank.zip and b/packages/corpora/sinica_treebank.zip differ diff --git a/packages/corpora/stopwords.xml b/packages/corpora/stopwords.xml index 0be3d33..fabee47 100644 --- a/packages/corpora/stopwords.xml +++ b/packages/corpora/stopwords.xml @@ -1,4 +1,4 @@ diff --git a/packages/corpora/stopwords.zip b/packages/corpora/stopwords.zip index 1abac39..56d35b5 100644 Binary files a/packages/corpora/stopwords.zip and b/packages/corpora/stopwords.zip differ diff --git a/packages/corpora/treebank.zip b/packages/corpora/treebank.zip index c85fa58..42d0c65 100644 Binary files a/packages/corpora/treebank.zip and b/packages/corpora/treebank.zip differ diff --git a/packages/corpora/verbnet.xml b/packages/corpora/verbnet.xml index d4bdea1..96256ff 100644 --- a/packages/corpora/verbnet.xml +++ b/packages/corpora/verbnet.xml @@ -2,7 +2,7 @@ name="VerbNet Lexicon, Version 2.1" version="2.1" author="Karin Kipper-Schuler" - webpage="http://verbs.colorado.edu/~mpalmer/projects/verbnet.html" + webpage="https://verbs.colorado.edu/verbnet/" license="Distributed with permission of the author." 
unzip="1" /> diff --git a/packages/corpora/verbnet3.xml b/packages/corpora/verbnet3.xml new file mode 100644 index 0000000..bb9460f --- /dev/null +++ b/packages/corpora/verbnet3.xml @@ -0,0 +1,9 @@ + + diff --git a/packages/corpora/verbnet3.zip b/packages/corpora/verbnet3.zip new file mode 100644 index 0000000..46efce1 Binary files /dev/null and b/packages/corpora/verbnet3.zip differ diff --git a/packages/corpora/wordnet.xml b/packages/corpora/wordnet.xml index 9010417..5dbb80e 100644 --- a/packages/corpora/wordnet.xml +++ b/packages/corpora/wordnet.xml @@ -3,5 +3,5 @@ license="Permission to use, copy, modify and distribute this software and database and its documentation for any purpose and without fee or royalty is hereby granted, provided that you agree to comply with the following copyright notice and statements, including the disclaimer, and that the same appear on ALL copies of the software, database and documentation, including modifications that you make for internal use or for distribution.... [see webpage for full license]" copyright="WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved." 
webpage="http://wordnet.princeton.edu/" - unzip="1" + unzip="0" /> diff --git a/packages/corpora/wordnet2021.xml b/packages/corpora/wordnet2021.xml new file mode 100644 index 0000000..e6b4f97 --- /dev/null +++ b/packages/corpora/wordnet2021.xml @@ -0,0 +1,7 @@ + diff --git a/packages/corpora/wordnet2021.zip b/packages/corpora/wordnet2021.zip new file mode 100644 index 0000000..3f9b1e8 Binary files /dev/null and b/packages/corpora/wordnet2021.zip differ diff --git a/packages/corpora/wordnet2022.xml b/packages/corpora/wordnet2022.xml new file mode 100644 index 0000000..d4f57aa --- /dev/null +++ b/packages/corpora/wordnet2022.xml @@ -0,0 +1,7 @@ + diff --git a/packages/corpora/wordnet2022.zip b/packages/corpora/wordnet2022.zip new file mode 100644 index 0000000..935eee1 Binary files /dev/null and b/packages/corpora/wordnet2022.zip differ diff --git a/packages/corpora/wordnet31.xml b/packages/corpora/wordnet31.xml new file mode 100644 index 0000000..a5a2a50 --- /dev/null +++ b/packages/corpora/wordnet31.xml @@ -0,0 +1,7 @@ + diff --git a/packages/corpora/oanc_masc.zip b/packages/corpora/wordnet31.zip similarity index 56% rename from packages/corpora/oanc_masc.zip rename to packages/corpora/wordnet31.zip index c11f5a5..d1253a1 100644 Binary files a/packages/corpora/oanc_masc.zip and b/packages/corpora/wordnet31.zip differ diff --git a/packages/help/tagsets_json.xml b/packages/help/tagsets_json.xml new file mode 100644 index 0000000..2de89b7 --- /dev/null +++ b/packages/help/tagsets_json.xml @@ -0,0 +1,6 @@ + diff --git a/packages/help/tagsets_json.zip b/packages/help/tagsets_json.zip new file mode 100644 index 0000000..c6135cd Binary files /dev/null and b/packages/help/tagsets_json.zip differ diff --git a/packages/misc/mwa_ppdb.xml b/packages/misc/mwa_ppdb.xml new file mode 100644 index 0000000..60245bd --- /dev/null +++ b/packages/misc/mwa_ppdb.xml @@ -0,0 +1,5 @@ + diff --git a/packages/misc/mwa_ppdb.zip b/packages/misc/mwa_ppdb.zip new file mode 100644 index 
0000000..ae32aa7 Binary files /dev/null and b/packages/misc/mwa_ppdb.zip differ diff --git a/packages/misc/perluniprops.xml b/packages/misc/perluniprops.xml new file mode 100644 index 0000000..865bd5b --- /dev/null +++ b/packages/misc/perluniprops.xml @@ -0,0 +1,5 @@ + diff --git a/packages/misc/perluniprops.zip b/packages/misc/perluniprops.zip new file mode 100644 index 0000000..e1c3b74 Binary files /dev/null and b/packages/misc/perluniprops.zip differ diff --git a/packages/models/wmt15_eval.xml b/packages/models/wmt15_eval.xml new file mode 100644 index 0000000..42518e6 --- /dev/null +++ b/packages/models/wmt15_eval.xml @@ -0,0 +1,5 @@ + diff --git a/packages/models/wmt15_eval.zip b/packages/models/wmt15_eval.zip new file mode 100644 index 0000000..17fa083 Binary files /dev/null and b/packages/models/wmt15_eval.zip differ diff --git a/packages/sentiment/vader_lexicon.xml b/packages/sentiment/vader_lexicon.xml new file mode 100644 index 0000000..c805430 --- /dev/null +++ b/packages/sentiment/vader_lexicon.xml @@ -0,0 +1,6 @@ + diff --git a/packages/sentiment/vader_lexicon.zip b/packages/sentiment/vader_lexicon.zip new file mode 100644 index 0000000..c8d4b96 Binary files /dev/null and b/packages/sentiment/vader_lexicon.zip differ diff --git a/packages/stemmers/porter_test.xml b/packages/stemmers/porter_test.xml new file mode 100644 index 0000000..13e23fd --- /dev/null +++ b/packages/stemmers/porter_test.xml @@ -0,0 +1,4 @@ + diff --git a/packages/stemmers/porter_test.zip b/packages/stemmers/porter_test.zip new file mode 100644 index 0000000..7f4fb04 Binary files /dev/null and b/packages/stemmers/porter_test.zip differ diff --git a/packages/taggers/averaged_perceptron_tagger.xml b/packages/taggers/averaged_perceptron_tagger.xml new file mode 100644 index 0000000..bc07724 --- /dev/null +++ b/packages/taggers/averaged_perceptron_tagger.xml @@ -0,0 +1,7 @@ + diff --git a/packages/taggers/averaged_perceptron_tagger.zip b/packages/taggers/averaged_perceptron_tagger.zip 
new file mode 100644 index 0000000..d5bfb68 Binary files /dev/null and b/packages/taggers/averaged_perceptron_tagger.zip differ diff --git a/packages/taggers/averaged_perceptron_tagger_eng.xml b/packages/taggers/averaged_perceptron_tagger_eng.xml new file mode 100644 index 0000000..c3949c6 --- /dev/null +++ b/packages/taggers/averaged_perceptron_tagger_eng.xml @@ -0,0 +1,7 @@ + diff --git a/packages/taggers/averaged_perceptron_tagger_eng.zip b/packages/taggers/averaged_perceptron_tagger_eng.zip new file mode 100644 index 0000000..b792e19 Binary files /dev/null and b/packages/taggers/averaged_perceptron_tagger_eng.zip differ diff --git a/packages/taggers/averaged_perceptron_tagger_ru.xml b/packages/taggers/averaged_perceptron_tagger_ru.xml new file mode 100644 index 0000000..a399605 --- /dev/null +++ b/packages/taggers/averaged_perceptron_tagger_ru.xml @@ -0,0 +1,7 @@ + diff --git a/packages/taggers/averaged_perceptron_tagger_ru.zip b/packages/taggers/averaged_perceptron_tagger_ru.zip new file mode 100644 index 0000000..50e313e Binary files /dev/null and b/packages/taggers/averaged_perceptron_tagger_ru.zip differ diff --git a/packages/taggers/averaged_perceptron_tagger_rus.xml b/packages/taggers/averaged_perceptron_tagger_rus.xml new file mode 100644 index 0000000..6f9d436 --- /dev/null +++ b/packages/taggers/averaged_perceptron_tagger_rus.xml @@ -0,0 +1,7 @@ + diff --git a/packages/taggers/averaged_perceptron_tagger_rus.zip b/packages/taggers/averaged_perceptron_tagger_rus.zip new file mode 100644 index 0000000..e1bba62 Binary files /dev/null and b/packages/taggers/averaged_perceptron_tagger_rus.zip differ diff --git a/packages/taggers/hmm_treebank_pos_tagger.xml b/packages/taggers/hmm_treebank_pos_tagger.xml deleted file mode 100644 index 105d658..0000000 --- a/packages/taggers/hmm_treebank_pos_tagger.xml +++ /dev/null @@ -1,5 +0,0 @@ - diff --git a/packages/taggers/hmm_treebank_pos_tagger.zip b/packages/taggers/hmm_treebank_pos_tagger.zip deleted file mode 100644 
index e2944e5..0000000 Binary files a/packages/taggers/hmm_treebank_pos_tagger.zip and /dev/null differ diff --git a/packages/taggers/maxent_treebank_pos_tagger_tab.xml b/packages/taggers/maxent_treebank_pos_tagger_tab.xml new file mode 100644 index 0000000..c70f85b --- /dev/null +++ b/packages/taggers/maxent_treebank_pos_tagger_tab.xml @@ -0,0 +1,5 @@ + diff --git a/packages/taggers/maxent_treebank_pos_tagger_tab.zip b/packages/taggers/maxent_treebank_pos_tagger_tab.zip new file mode 100644 index 0000000..5fb9e58 Binary files /dev/null and b/packages/taggers/maxent_treebank_pos_tagger_tab.zip differ diff --git a/packages/taggers/universal_tagset.xml b/packages/taggers/universal_tagset.xml index 5e3522a..36170bb 100644 --- a/packages/taggers/universal_tagset.xml +++ b/packages/taggers/universal_tagset.xml @@ -1,4 +1,7 @@ - diff --git a/packages/taggers/universal_tagset.zip b/packages/taggers/universal_tagset.zip index 5d4de64..543f8de 100644 Binary files a/packages/taggers/universal_tagset.zip and b/packages/taggers/universal_tagset.zip differ diff --git a/packages/tokenizers/punkt.xml b/packages/tokenizers/punkt.xml index b5b438e..0f25435 100644 --- a/packages/tokenizers/punkt.xml +++ b/packages/tokenizers/punkt.xml @@ -1,6 +1,6 @@ diff --git a/packages/tokenizers/punkt.zip b/packages/tokenizers/punkt.zip index 7291138..da7ffbd 100644 Binary files a/packages/tokenizers/punkt.zip and b/packages/tokenizers/punkt.zip differ diff --git a/packages/tokenizers/punkt_tab.xml b/packages/tokenizers/punkt_tab.xml new file mode 100644 index 0000000..feba7a3 --- /dev/null +++ b/packages/tokenizers/punkt_tab.xml @@ -0,0 +1,6 @@ + diff --git a/packages/tokenizers/punkt_tab.zip b/packages/tokenizers/punkt_tab.zip new file mode 100644 index 0000000..5e5ff61 Binary files /dev/null and b/packages/tokenizers/punkt_tab.zip differ diff --git a/tools/build_collections.py b/tools/build_collections.py new file mode 100644 index 0000000..f6d2027 --- /dev/null +++ 
b/tools/build_collections.py @@ -0,0 +1,51 @@ + +import os +import sys +from glob import glob +from typing import List +from xml.etree import ElementTree +from nltk.downloader import _indent_xml + +if len(sys.argv) != 2: + print("Usage: ") + print("build_collections.py ") + sys.exit(-1) + +ROOT = sys.argv[1] + +def write(file_name: str, coll_name: str, items: List[str]) -> None: + """Write `collection/{file_name}.xml` with `file_name` as the collection `id`, + `coll_name` as the collection `name`, and `items` as a list of collection items. + + :param file_name: The id of the collection, equivalent to the file name, + e.g. `all-corpora`. + :type file_name: str + :param coll_name: The name of the collection, e.g. `"All corpora"` + :type coll_name: str + :param items: A list of names for the collection items, e.g. `["abc", "alpino", ...]` + :type items: List[str] + """ + et = ElementTree.Element("collection", id=file_name, name=coll_name) + et.extend(ElementTree.Element("item", ref=item) for item in sorted(items)) + _indent_xml(et) + with open(os.path.join(ROOT, "collections", file_name + ".xml"), "w", encoding="utf8") as f: + f.write(ElementTree.tostring(et).decode("utf8")) + +def get_id(xml_path: str) -> str: + """Given a full path, extract only the filename (i.e. the nltk_data id) + + :param xml_path: A full path, e.g. "./packages/corpora/abc.xml" + :type xml_path: str + :return: The filename, without the extension, e.g. 
"abc" + :rtype: str + """ + return os.path.splitext(os.path.basename(xml_path))[0] + +# Write `collection/all-corpora.xml` based on all files under /packages/corpora +corpora_items = [get_id(xml_path) for xml_path in glob(f"{ROOT}/packages/corpora/*.xml")] +write("all-corpora", "All the corpora", corpora_items) + +# Write `collection/all-nltk.xml` and `collection/all.xml` based on all files under /packages +all_items = [get_id(xml_path) for xml_path in glob(f"{ROOT}/packages/**/*.xml")] +write("all-nltk", "All packages available on nltk_data gh-pages branch", all_items) +write("all", "All packages", all_items) \ No newline at end of file diff --git a/tools/build_pkg_index.py b/tools/build_pkg_index.py index 6aeeaa7..d84cfed 100755 --- a/tools/build_pkg_index.py +++ b/tools/build_pkg_index.py @@ -27,5 +27,6 @@ out = open(OUT, 'w') out.write(xml_header) out.write(s) +out.write('\n') out.close()