#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from mardi_importer.importer.Importer import ADataSource, ImporterException
from mardi_importer.integrator import MardiIntegrator
from .RPackage import RPackage
import pandas as pd
import time
import json
import os
import logging
log = logging.getLogger('CRANlogger')
class CRANSource(ADataSource):
"""Processes data from the Comprehensive R Archive Network.

    Metadata for each R package is scraped from the CRAN repository. The
    Wikibase item corresponding to each package is then updated, or created
    if the package is not yet present in the knowledge graph.

Attributes:
packages (Pandas dataframe):
Dataframe with **package name**, **title** and **date of publication** for
each package in CRAN.
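
    Example:
        A minimal usage sketch; it assumes a reachable MaRDI Wikibase instance
        configured for :class:`MardiIntegrator` (connection details are handled
        elsewhere in ``mardi_importer``)::

            source = CRANSource()
            source.setup()  # import Wikidata entities, create local ones
            source.pull()   # scrape the CRAN package table
            source.push()   # create or update the corresponding Wikibase items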
"""
def __init__(self):
self.integrator = MardiIntegrator()
self.filepath = os.path.realpath(os.path.dirname(__file__))
        self.packages = pd.DataFrame()
def setup(self):
"""Create all necessary properties and entities for CRAN
"""
# Import entities from Wikidata
filename = self.filepath + "/wikidata_entities.txt"
self.integrator.import_entities(filename=filename)
# Create new required local entities
self.create_local_entities()
    def create_local_entities(self):
        """Create the local properties and items defined in ``new_entities.json``."""
        filename = self.filepath + "/new_entities.json"
        with open(filename) as f:
            entities = json.load(f)
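        # Create each property from its JSON definition and write it to the
        # local Wikibase only if it does not already exist there.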
for prop_element in entities['properties']:
prop = self.integrator.property.new()
prop.labels.set(language='en', value=prop_element['label'])
prop.descriptions.set(language='en', value=prop_element['description'])
prop.datatype = prop_element['datatype']
if not prop.exists(): prop.write()
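        # Create each item with its English label, description and claims,
        # again writing it only if it does not already exist.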
for item_element in entities['items']:
item = self.integrator.item.new()
item.labels.set(language='en', value=item_element['label'])
item.descriptions.set(language='en', value=item_element['description'])
for key, value in item_element['claims'].items():
                item.add_claim(key, value=value)
if not item.exists(): item.write()
def pull(self):
"""Reads **date**, **package name** and **title** from the CRAN Repository URL.
The result is saved as a pandas dataframe in the attribute **packages**.
Returns:
Pandas dataframe: Attribute ``packages``
Raises:
ImporterException: If table at the CRAN url cannot be accessed or read.
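
        Example:
            Illustrative sketch only; the table contents change daily. The
            resulting dataframe is expected to provide the columns that
            :meth:`push` relies on::

                >>> source = CRANSource()
                >>> df = source.pull()    # doctest: +SKIP
                >>> list(df.columns)      # doctest: +SKIP
                ['Date', 'Package', 'Title']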
"""
url = r"https://cran.r-project.org/web/packages/available_packages_by_date.html"
try:
tables = pd.read_html(url) # Returns list of all tables on page
except Exception as e:
raise ImporterException(
"Error attempting to read table from CRAN url\n{}".format(e)
)
else:
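            # The first table on the page is the list of packages by publication date.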
self.packages = tables[0]
return self.packages
def push(self):
"""Updates the MaRDI Wikibase entities corresponding to R packages.
For each **package name** in the attribute **packages** checks
if the date in CRAN coincides with the date in the MaRDI
knowledge graph. If not, the package is updated. If the package
is not found in the MaRDI knowledge graph, the corresponding
item is created.
It creates a :class:`mardi_importer.cran.RPackage` instance
for each package.
"""
        # Uncomment the next line to restrict the run to the first 100 packages
        # (instead of processing all ~19000 CRAN packages).
        #self.packages = self.packages.loc[:100, :]
for _, row in self.packages.iterrows():
package_date = row["Date"]
package_label = row["Package"]
package_title = row["Title"]
package = RPackage(package_date, package_label, package_title, self.integrator)
if package.exists():
if not package.is_updated():
print(f"Package {package_label} found: Not up to date. Attempting update...")
package.update()
else:
print(f"Package {package_label} found: Already up to date.")
else:
print(f"Package {package_label} not found: Attempting item creation...")
package.create()
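            # Brief pause between packages so that CRAN and the Wikibase API
            # are not hit in rapid succession.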
time.sleep(2)