Coverage for src/crawler/by_source/ipb_crawler.py: 95%
51 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
1import json
2from urllib.parse import parse_qs, urlencode, urljoin, urlparse
4from ptf.model_data import (
5 IssueData,
6 create_abstract,
7 create_articledata,
8 create_contributor,
9 create_extlink,
10 create_subj,
11)
13from crawler.base_crawler import BaseCollectionCrawler
class IpbCrawler(BaseCollectionCrawler):
    """Crawler for journals hosted by International Press of Boston.

    Collection, issue, and article metadata are fetched as JSON from the
    ``link.intlpress.com`` bgcloud-front API.
    """

    source_name = "International Press of Boston"
    source_domain = "IPB"
    source_website = "https://link.intlpress.com/"
    ignore_missing_pdf = False

    def parse_collection_content(self, content):
        """Parse the collection-level JSON and return one xissue per issue.

        The periodical id is recovered from the query string of
        ``self.collection_url`` and combined with each issue id to build the
        per-issue API URL.
        """
        collection_urlparsed = urlparse(self.collection_url)
        collection_query = parse_qs(collection_urlparsed.query)

        xissues = []
        data = json.loads(content)
        for volume in data["data"]["volumes"]:
            for issue in volume["issues"]:
                issue_query = {"periodicalId": collection_query["id"][0], "issueId": issue["id"]}
                issue_url = (
                    "https://link.intlpress.com/api/bgcloud-front/periodical/issue-content-list?"
                    + urlencode(issue_query)
                )

                xissues.append(
                    self.create_xissue(
                        issue_url,
                        str(volume["fYear"]),
                        str(volume["volumeNum"]),
                        str(issue["issueNum"]),
                    )
                )
        return xissues

    def parse_issue_content(self, content, xissue):
        """Parse the issue-level JSON and append its articles to `xissue`."""
        data = json.loads(content)
        for index, article_dict in enumerate(data["data"]["content"]):
            xissue.articles.append(self.parse_ipb_article(article_dict, xissue, index))

    def parse_ipb_article(self, article_dict: dict, xissue: IssueData, index: int):
        """Build article data from one entry of the issue content list.

        Raises:
            ValueError: if `xissue.pid` or `xissue.url` is not set; the
                article pid is derived from the issue pid.
        """
        if not xissue.pid:
            raise ValueError("You must set xissue.pid before parsing an article")
        if not xissue.url:
            raise ValueError("You must set xissue.url before parsing an article")

        xarticle = create_articledata()
        xarticle.lang = "en"
        xarticle.pid = xissue.pid + "_a" + str(index)
        # The API returns the DOI as a full dx.doi.org URL, or an empty string.
        xarticle.doi = article_dict["doi"].removeprefix("https://dx.doi.org/")
        if xarticle.doi == "":
            xarticle.doi = None

        ext_link = create_extlink(
            rel="source",
            location=urljoin("https://link.intlpress.com/JDetail/", article_dict["id"]),
            metadata=self.source_domain,
        )
        xarticle.ext_links.append(ext_link)
        xarticle.title_tex = article_dict["title"]
        # startPage/endPage may be falsy; normalize to "" as the model expects strings.
        xarticle.fpage = article_dict["startPage"] or ""
        xarticle.lpage = article_dict["endPage"] or ""

        # Keywords and MSC codes arrive as comma-separated strings; "" means none.
        if article_dict["keyword"] != "":
            for kwd in article_dict["keyword"].split(","):
                xarticle.kwds.append(create_subj(type="kwd", lang="en", value=kwd))

        if article_dict["mscValue"] != "":
            for msc in article_dict["mscValue"].split(","):
                xarticle.kwds.append(create_subj(type="msc", lang="en", value=msc))

        if article_dict["fAbstract"] != "":
            xarticle.abstracts.append(
                create_abstract(lang="en", tag="abstract", value_tex=article_dict["fAbstract"])
            )

        for author in article_dict["authors"]:
            xarticle.contributors.append(
                create_contributor(
                    string_name=author["fullName"], role="author", orcid=author["orcid"]
                )
            )
        return xarticle