prepare_trex.py (3335B)
from typing import Any, Dict, Iterable, List, Optional, Tuple
import argparse
import json
import os
import pathlib

import tqdm

from gbure.utils import DATA_PATH
import gbure.data.preprocessing as preprocessing

# Location and provenance of the raw T-REx dump.
DATASET_PATH: pathlib.Path = DATA_PATH / "T-REx"
DIRECTORY_NAME: str = "raw_data"
ARCHIVE_NAME: str = "T-REx.zip"  # plain string: no interpolation needed
ARCHIVE_SHA512: str = "30349fa6f01c1928ce15325521ebd05643787220f9a545eb23b280f9209cb1615f4a855b08604f943a1affb4d1f4f17b94f8434698f347a1cb7a0d820fa9de9f"
DOWNLOAD_URL: str = f"https://esimon.eu/GBURE/{ARCHIVE_NAME}"


def process_json_object(data: List[Dict[str, Any]]) -> Iterable[Tuple[str, List[Tuple[str, int, int]]]]:
    """Process a T-REx json object and yield (sentence, list of entities) tuples.

    Each yielded entity is a (wikidata_id, start, end) triple with character
    offsets relative to the start of its sentence.  Sentences with fewer than
    two entities are skipped, as are entities not produced by the Wikidata
    Spotlight entity linker (e.g. date annotations).
    """
    # Loop-invariant: every entity URI must carry this Wikidata prefix.
    prefix: str = "http://www.wikidata.org/entity/Q"
    for article in data:
        # The entity list is ordered by boundaries, so a single cursor (eid)
        # can sweep it once while iterating over sentence boundaries.
        eid: int = 0
        for sbs in article["sentences_boundaries"]:
            entities: List[Tuple[str, int, int]] = []
            # Skip entities that start before the current sentence.
            while eid < len(article["entities"]) and article["entities"][eid]["boundaries"][0] < sbs[0]:
                eid += 1

            # Collect entities fully contained in the current sentence.
            while eid < len(article["entities"]) and article["entities"][eid]["boundaries"][1] <= sbs[1]:
                entity: Dict[str, Any] = article["entities"][eid]
                eid += 1

                # Ignore date entities
                if entity["annotator"] != "Wikidata_Spotlight_Entity_Linker":
                    continue

                uri: str = entity["uri"]
                assert uri.startswith(prefix), f"unexpected entity URI: {uri}"
                uri = uri[len(prefix):]
                # Offsets are re-based to the sentence start.
                entities.append((uri, entity["boundaries"][0] - sbs[0], entity["boundaries"][1] - sbs[0]))

            # Ignore sentences with less than two entities: unsupervised
            # relation extraction needs at least one entity pair.
            if len(entities) < 2:
                continue

            sentence = article["text"][sbs[0]:sbs[1]]
            yield (sentence, entities)


def read_data(subset: Optional[int]) -> Iterable[Tuple[str, List[Tuple[str, int, int]]]]:
    """Read all T-REx files and yield (sentence, list of entities) tuples.

    Args:
        subset: if not None, only process the first `subset` files (used to
            build a small debug dataset).
    """
    filenames: List[str] = [filename for filename in os.listdir(DATASET_PATH / DIRECTORY_NAME) if filename.endswith(".json")]

    # Make the order deterministic.
    filenames.sort()
    if subset is not None:
        filenames = filenames[:subset]

    for filename in tqdm.tqdm(filenames, desc="loading"):
        # The dump is UTF-8 JSON; be explicit so parsing is locale-independent.
        with open(DATASET_PATH / DIRECTORY_NAME / filename, "r", encoding="utf-8") as file:
            data: List[Dict[str, Any]] = json.load(file)
        yield from process_json_object(data)


if __name__ == "__main__":
    parser: argparse.ArgumentParser = preprocessing.base_argument_parser("Prepare the unsupervised TREx dataset.")
    parser.add_argument("-S", "--subset",
                        type=int,
                        help="Number of files to process (default to all, only used for creating a debug dataset)")
    args: argparse.Namespace = parser.parse_args()
    # Encode the subset size in the dataset name so debug datasets don't clash.
    name: str = preprocessing.dataset_name(args, "" if args.subset is None else f"-ss{args.subset}")

    # Download (if needed), verify and unpack the raw archive, then serialize.
    preprocessing.get_zip_data(DATASET_PATH, DIRECTORY_NAME, ARCHIVE_NAME, ARCHIVE_SHA512, DOWNLOAD_URL, unzip_directory=True)
    preprocessing.serialize_unsupervised_dataset(
        path=DATASET_PATH / name,
        data=read_data(args.subset),
        **preprocessing.args_to_serialize(args))