twitter_crawl.py

from twikit.client.client import Client

from server.spider.twitter_db import DatabaseHandler


class TwitterCrawler:
    def __init__(self, client: Client, db_handler: DatabaseHandler):
        self.client = client
        self.db_handler = db_handler

    async def crawl_user_tweets(self, user_id: str):
        for tweet_type in ['Tweets', 'Replies', 'Media']:
            latest_cursor = self.db_handler.get_latest_twitter_id(user_id, tweet_type)
            while True:
                # Request the tweets of the current type
                result = await self.client.get_user_tweets(user_id, tweet_type, count=40, cursor=latest_cursor)
                print(result)
                # Process the tweets returned by this request
                for tweet in result:
                    # Save the tweet content to the database
                    await self.db_handler.save_tweet(tweet, tweet_type, latest_cursor)
                # If there is a next page, update the cursor and keep fetching
                if result.next_cursor:
                    latest_cursor = result.next_cursor
                else:
                    break  # No more pages: stop crawling

    async def crawl_user(self, user_name: str):
        # Fetch the user's profile data
        result = await self.client.get_user_by_screen_name(user_name)
        if result:
            # Extract the user's key fields
            user_data = {
                'id': result.id,
                'name': result.name,
                'screen_name': result.screen_name,
                'profile_image_url': result.profile_image_url,
                'profile_banner_url': result.profile_banner_url,
                'url': result.url,
                'location': result.location,
                'description': result.description,
                'is_blue_verified': result.is_blue_verified,
                'verified': result.verified,
                'possibly_sensitive': result.possibly_sensitive,
                'can_dm': result.can_dm,
                'can_media_tag': result.can_media_tag,
                'want_retweets': result.want_retweets,
                'default_profile': result.default_profile,
                'default_profile_image': result.default_profile_image,
                'followers_count': result.followers_count,
                'fast_followers_count': result.fast_followers_count,
                'normal_followers_count': result.normal_followers_count,
                'following_count': result.following_count,
                'favourites_count': result.favourites_count,
                'listed_count': result.listed_count,
                'media_count': result.media_count,
                'statuses_count': result.statuses_count,
                'is_translator': result.is_translator,
                'translator_type': result.translator_type,
                'profile_interstitial_type': None,
                'withheld_in_countries': None
            }
            # Save the data to the database
            await self.db_handler.save_user(user_data)
            print(f"User data for {user_name} saved successfully.")