Coverage for src/crawler/by_source/ipb_crawler.py: 95%
50 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

import json
from urllib.parse import parse_qs, urlencode, urljoin, urlparse

from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler


class IpbCrawler(BaseCollectionCrawler):
    source_name = "International Press of Boston"
    source_domain = "IPB"
    source_website = "https://link.intlpress.com/"

    def parse_collection_content(self, content):
        """Parse the periodical JSON payload and build one xissue per issue found."""
        collection_urlparsed = urlparse(self.collection_url)
        collection_query = parse_qs(collection_urlparsed.query)

        xissues = []
        data = json.loads(content)
        for volume in data["data"]["volumes"]:
            for issue in volume["issues"]:
                issue_query = {"periodicalId": collection_query["id"][0], "issueId": issue["id"]}
                issue_url = (
                    "https://link.intlpress.com/api/bgcloud-front/periodical/issue-content-list?"
                    + urlencode(issue_query)
                )

                xissues.append(
                    self.create_xissue(
                        issue_url,
                        str(volume["fYear"]),
                        str(volume["volumeNum"]),
                        str(issue["issueNum"]),
                    )
                )
        return xissues
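
    # Illustrative only: a minimal sketch of the collection payload shape that
    # parse_collection_content expects, inferred from the key accesses above.
    # All field values are hypothetical placeholders, not real IPB data.
    #
    #   {
    #       "data": {
    #           "volumes": [
    #               {
    #                   "fYear": 2024,
    #                   "volumeNum": 12,
    #                   "issues": [{"id": "abc123", "issueNum": 1}],
    #               }
    #           ]
    #       }
    #   }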

    def parse_issue_content(self, content, xissue):
        """Parse the issue JSON payload and append one article per entry."""
        data = json.loads(content)
        for index, article_dict in enumerate(data["data"]["content"]):
            xissue.articles.append(self.parse_ipb_article(article_dict, xissue, index))

    def parse_ipb_article(self, article_dict: dict, xissue: IssueData, index: int):
        """Build an article from one entry of the issue payload."""
        if not xissue.pid:  # coverage: branch never taken (pid was always set)
            raise ValueError("You must set xissue.pid before parsing an article")
        if not xissue.url:  # coverage: branch never taken (url was always set)
            raise ValueError("You must set xissue.url before parsing an article")

        xarticle = create_articledata()
        xarticle.lang = "en"
        xarticle.pid = xissue.pid + "_a" + str(index)
        xarticle.doi = article_dict["doi"].removeprefix("https://dx.doi.org/")
        if xarticle.doi == "":
            xarticle.doi = None

        ext_link = create_extlink(
            rel="source",
            location=urljoin("https://link.intlpress.com/JDetail/", article_dict["id"]),
            metadata=self.source_domain,
        )
        xarticle.ext_links.append(ext_link)
        xarticle.title_tex = article_dict["title"]
        xarticle.fpage = article_dict["startPage"] or ""
        xarticle.lpage = article_dict["endPage"] or ""

        if article_dict["keyword"] != "":
            for kwd in article_dict["keyword"].split(","):
                xarticle.kwds.append(create_subj(type="kwd", lang="en", value=kwd))

        if article_dict["mscValue"] != "":
            for msc in article_dict["mscValue"].split(","):
                xarticle.kwds.append(create_subj(type="msc", lang="en", value=msc))

        if article_dict["fAbstract"] != "":
            xarticle.abstracts.append(
                create_abstract(lang="en", tag="abstract", value_tex=article_dict["fAbstract"])
            )

        for author in article_dict["authors"]:
            xarticle.contributors.append(
                create_contributor(
                    string_name=author["fullName"], role="author", orcid=author["orcid"]
                )
            )
        return xarticle
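
# Illustrative only: the per-article dict shape that parse_ipb_article reads,
# inferred from the key accesses above. All values are hypothetical placeholders.
#
#   {
#       "id": "xyz789",
#       "doi": "https://dx.doi.org/10.0000/example",
#       "title": "An Example Title",
#       "startPage": "1",
#       "endPage": "20",
#       "keyword": "keyword one,keyword two",
#       "mscValue": "14J60,32Q25",
#       "fAbstract": "An example abstract.",
#       "authors": [{"fullName": "Jane Doe", "orcid": ""}],
#   }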