Coverage for src/crawler/by_source/ipb_crawler.py: 95%

51 statements  

« prev     ^ index     » next       coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

1import json 

2from urllib.parse import parse_qs, urlencode, urljoin, urlparse 

3 

4from ptf.model_data import ( 

5 IssueData, 

6 create_abstract, 

7 create_articledata, 

8 create_contributor, 

9 create_extlink, 

10 create_subj, 

11) 

12 

13from crawler.base_crawler import BaseCollectionCrawler 

14 

15 

class IpbCrawler(BaseCollectionCrawler):
    """Crawler for journals hosted by International Press of Boston.

    Collection and issue metadata are fetched as JSON from the
    ``link.intlpress.com`` bgcloud-front API and converted into
    ptf issue/article data objects.
    """

    source_name = "International Press of Boston"
    source_domain = "IPB"
    source_website = "https://link.intlpress.com/"
    ignore_missing_pdf = False

    def parse_collection_content(self, content):
        """Parse the collection JSON and return one xissue per issue of every volume.

        The periodical id is recovered from the ``id`` query parameter of
        ``self.collection_url``; each issue URL points at the API endpoint
        that lists that issue's articles.
        """
        # The periodical id is constant for the whole collection — read it once.
        periodical_id = parse_qs(urlparse(self.collection_url).query)["id"][0]
        payload = json.loads(content)

        xissues = []
        for volume in payload["data"]["volumes"]:
            year = str(volume["fYear"])
            volume_num = str(volume["volumeNum"])
            for issue in volume["issues"]:
                query_string = urlencode(
                    {"periodicalId": periodical_id, "issueId": issue["id"]}
                )
                issue_url = (
                    "https://link.intlpress.com/api/bgcloud-front/periodical/issue-content-list?"
                    + query_string
                )
                xissues.append(
                    self.create_xissue(
                        issue_url, year, volume_num, str(issue["issueNum"])
                    )
                )
        return xissues

    def parse_issue_content(self, content, xissue):
        """Populate ``xissue.articles`` from the issue content-list JSON."""
        entries = json.loads(content)["data"]["content"]
        xissue.articles.extend(
            self.parse_ipb_article(entry, xissue, position)
            for position, entry in enumerate(entries)
        )

    def parse_ipb_article(self, article_dict: dict, xissue: IssueData, index: int):
        """Build an article data object from one entry of the issue content list.

        ``xissue.pid`` and ``xissue.url`` must already be set; the article pid
        is derived from the issue pid and the article's position in the issue.
        """
        if not xissue.pid:
            raise ValueError("You must set xissue.pid before parsing an article")
        if not xissue.url:
            raise ValueError("You must set xissue.url before parsing an article")

        xarticle = create_articledata()
        xarticle.lang = "en"
        xarticle.pid = f"{xissue.pid}_a{index}"
        # The API serves DOIs as full dx.doi.org URLs; keep only the bare DOI,
        # and normalise an absent DOI ("" in the feed) to None.
        bare_doi = article_dict["doi"].removeprefix("https://dx.doi.org/")
        xarticle.doi = bare_doi if bare_doi != "" else None

        xarticle.ext_links.append(
            create_extlink(
                rel="source",
                location=urljoin(
                    "https://link.intlpress.com/JDetail/", article_dict["id"]
                ),
                metadata=self.source_domain,
            )
        )
        xarticle.title_tex = article_dict["title"]
        # Page numbers may be null in the feed; store "" in that case.
        xarticle.fpage = article_dict["startPage"] or ""
        xarticle.lpage = article_dict["endPage"] or ""

        # Keywords and MSC codes arrive as comma-separated strings.
        if article_dict["keyword"] != "":
            xarticle.kwds.extend(
                create_subj(type="kwd", lang="en", value=keyword)
                for keyword in article_dict["keyword"].split(",")
            )

        if article_dict["mscValue"] != "":
            xarticle.kwds.extend(
                create_subj(type="msc", lang="en", value=msc_code)
                for msc_code in article_dict["mscValue"].split(",")
            )

        if article_dict["fAbstract"] != "":
            xarticle.abstracts.append(
                create_abstract(
                    lang="en", tag="abstract", value_tex=article_dict["fAbstract"]
                )
            )

        xarticle.contributors.extend(
            create_contributor(
                string_name=author["fullName"], role="author", orcid=author["orcid"]
            )
            for author in article_dict["authors"]
        )
        return xarticle