2017-06-29 4 views
1

Voici le code:Je reçois l'erreur avec mes codes pour lire des ensembles de données twitter de JSON en utilisant python 3.6.1

import json 
import re 

emoticons_str = r""" 
    (?: 
     [:=;] # Eyes 
     [oO\-]? # Nose (optional) 
     [D\)\]\(\]/\\OpP] # Mouth 
    )""" 

regex_str = [ 
    emoticons_str, 
    r'<[^>]+>', # HTML tags 
    r'(?:@[\w_]+)', # @-mentions 
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags 
    r'http[s]?://(?:[a-z]|[0-9]|[[email protected]&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs 

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers 
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and ' 
    r'(?:[\w_]+)', # other words 
    r'(?:\S)' # anything else 
] 

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE) 
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE) 

def tokenize(s): 
    return tokens_re.findall(s) 

def preprocess(s, lowercase=False): 
    tokens = tokenize(s) 
    if lowercase: 
     tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens] 
    return tokens 
with open('mytweets.json', mode='r', encoding='utf-8') as f: 
    for line in f: 
     #line = f.readline() 
     tweet = json.loads(line) 
    print(preprocess(tweet['text'])) 

Après l'exécution indiquant les problèmes: Getting the problem after running the codes

Quelle est la solution du problème? Comment je peux lire avec succès les données et tokenize les tweets de format json?

Voici quelques exemples de mytweets.json

{"created_at":"Thu Jun 22 21:50:18 +0000 2017","id":878007261674602496,"id_str":"878007261674602496","text":"RT @wreckitroy: Well, I like dick, so I don't see this as a possibility, but thanks for trying to reach that far up my ass to try t\u2026 ","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":632645991,"id_str":"632645991","name":"meche","screen_name":"mercedessreyes","location":null,"url":null,"description":"I mean, really it's same me, it's old me \u2022 FSU '21 \u2022 https:\/\/vsco.co\/onlymeche","protected":false,"verified":false,"followers_count":1039,"friends_count":352,"listed_count":6,"favourites_count":21860,"statuses_count":21676,"created_at":"Wed Jul 11 04:06:28 +0000 2012","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FCEBB6","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/762423763\/6c7d56ca20260816f75c10759208b283.png","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/762423763\/6c7d56ca20260816f75c10759208b283.png","profile_background_tile":true,"profile_link_color":"CE7834","profile_sidebar_border_color":"F0A830","profile_sidebar_fill_color":"78C0A8","profile_text_color":"5E412F","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/876886584087502848\/9WSQDm8F_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/876886584087502848\/9WSQDm8F_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/632645991\/1497147929","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Jun 21 02:57:42 +0000 2017","id":877359845074018304,"id_str":"877359845074018304","text":"Well, I like dick, so I don't see this as a possibility, but thanks for trying to reach that far up my ass to try t\u2026 https:\/\/t.co\/lUJzY60Sn8","display_text_range":[0,140],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2341390003,"id_str":"2341390003","name":"roy","screen_name":"wreckitroy","location":"Fresno, CA","url":null,"description":"She said I'm looking like a bad man, smooth criminal. \ud83c\udf43 \/ snapchat\/instagram: thericharrow","protected":false,"verified":false,"followers_count":4831,"friends_count":1103,"listed_count":23,"favourites_count":79829,"statuses_count":1012,"created_at":"Thu Feb 13 04:30:59 +0000 2014","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/876941549874978816\/eTGFmh8u_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/876941549874978816\/eTGFmh8u_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/2341390003\/1498157548","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":877359034621468672,"quoted_status_id_str":"877359034621468672","quoted_status":{"created_at":"Wed Jun 21 02:54:29 +0000 2017","id":877359034621468672,"id_str":"877359034621468672","text":"When you trying so hard to getvout the friend zone\ud83d\ude02\ud83d\ude02 https:\/\/t.co\/i8yFNbGDNn","display_text_range":[0,52],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":844510650,"id_str":"844510650","name":"\u3164","screen_name":"DaddyGunPlay","location":null,"url":null,"description":"One of the best Contoller players dont @. Bo2 is surperior #JellyFam\ud83c\udf47","protected":false,"verified":false,"followers_count":325,"friends_count":276,"listed_count":3,"favourites_count":1795,"statuses_count":5009,"created_at":"Mon Sep 24 23:51:03 +0000 2012","utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"3B94D9","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/874005327045414913\/NUPA2rvD_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/874005327045414913\/NUPA2rvD_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/844510650\/1496174936","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":877210813462740992,"quoted_status_id_str":"877210813462740992","is_quote_status":true,"retweet_count":45,"favorite_count":138,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/i8yFNbGDNn","expanded_url":"https:\/\/twitter.com\/wreckitroy\/status\/877210813462740992","display_url":"twitter.com\/wreckitroy\/sta\u2026","indices":[53,76]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":true,"extended_tweet":{"full_text":"Well, I like dick, so I don't see this as a possibility, but thanks for trying to reach that far up my ass to try to find the truth. \ud83d\ude09 https:\/\/t.co\/fv4Kqvv2sb","display_text_range":[0,134],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/fv4Kqvv2sb","expanded_url":"https:\/\/twitter.com\/daddygunplay\/status\/877359034621468672","display_url":"twitter.com\/daddygunplay\/s\u2026","indices":[135,158]}],"user_mentions":[],"symbols":[]}},"retweet_count":2496,"favorite_count":12594,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/lUJzY60Sn8","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/877359845074018304","display_url":"twitter.com\/i\/web\/status\/8\u2026","indices":[117,140]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"quoted_status_id":877359034621468672,"quoted_status_id_str":"877359034621468672","quoted_status":{"created_at":"Wed Jun 21 02:54:29 +0000 2017","id":877359034621468672,"id_str":"877359034621468672","text":"When you trying so hard to getvout the friend zone\ud83d\ude02\ud83d\ude02 https:\/\/t.co\/i8yFNbGDNn","display_text_range":[0,52],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":844510650,"id_str":"844510650","name":"\u3164","screen_name":"DaddyGunPlay","location":null,"url":null,"description":"One of the best Contoller players dont @. Bo2 is surperior #JellyFam\ud83c\udf47","protected":false,"verified":false,"followers_count":325,"friends_count":276,"listed_count":3,"favourites_count":1795,"statuses_count":5009,"created_at":"Mon Sep 24 23:51:03 +0000 2012","utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"3B94D9","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/874005327045414913\/NUPA2rvD_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/874005327045414913\/NUPA2rvD_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/844510650\/1496174936","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":877210813462740992,"quoted_status_id_str":"877210813462740992","is_quote_status":true,"retweet_count":45,"favorite_count":138,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/i8yFNbGDNn","expanded_url":"https:\/\/twitter.com\/wreckitroy\/status\/877210813462740992","display_url":"twitter.com\/wreckitroy\/sta\u2026","indices":[53,76]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":true,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"","expanded_url":null,"indices":[133,133]}],"user_mentions":[{"screen_name":"wreckitroy","name":"roy","id":2341390003,"id_str":"2341390003","indices":[3,14]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1498168218426"} 

{"created_at":"Thu Jun 22 21:50:18 +0000 2017","id":878007262320754692,"id_str":"878007262320754692","text":"It makes me feel some type of way now bree got another lil boy friend","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":47587983,"id_str":"47587983","name":"Kee Gotti","screen_name":"_BadGalKee","location":"Columbus, OH","url":null,"description":"\u2022 Instagram|_badgalkee \u2022 SnapChat| kbabiy","protected":false,"verified":false,"followers_count":1107,"friends_count":639,"listed_count":12,"favourites_count":1160,"statuses_count":28359,"created_at":"Tue Jun 16 09:46:12 +0000 2009","utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/850590447261167616\/MuywFrn8_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/850590447261167616\/MuywFrn8_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/47587983\/1487216863","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1498168218580"} 

{"created_at":"Thu Jun 22 21:50:18 +0000 2017","id":878007263310393344,"id_str":"878007263310393344","text":"I liked a @YouTube video https:\/\/t.co\/Znu4govqDi My Friend is in LOVE ...","source":"\u003ca href=\"http:\/\/www.google.com\/\" rel=\"nofollow\"\u003eGoogle\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":42287518,"id_str":"42287518","name":"David","screen_name":"iceman120","location":"FT LAUDERDALE, FL","url":"http:\/\/www.youtube.com\/iceman120dl","description":"\ue10e\ue10eOH YOU WANT SOME OF THIS\ue12f\ue12f\ue12f\ue12f\ue10e\ue10e","protected":false,"verified":false,"followers_count":4667,"friends_count":361,"listed_count":69,"favourites_count":134,"statuses_count":69716,"created_at":"Sun May 24 21:43:04 +0000 2009","utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/53704022\/ahamericanflag72.br.jpg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/53704022\/ahamericanflag72.br.jpg","profile_background_tile":false,"profile_link_color":"D60000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"1C1939","profile_text_color":"777777","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/511261204120363008\/DuNoXOXB_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/511261204120363008\/DuNoXOXB_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/42287518\/1375147278","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/Znu4govqDi","expanded_url":"http:\/\/youtu.be\/up6u1hzWHHc?a","display_url":"youtu.be\/up6u1hzWHHc?a","indices":[25,48]}],"user_mentions":[{"screen_name":"YouTube","name":"YouTube","id":10228272,"id_str":"10228272","indices":[10,18]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1498168218816"} 
+0

Pouvez-vous publier les données json que vous utilisez? – NieDzejkob

+0

Pouvez-vous s'il vous plaît ** coller ** le retraçage, nous préférons tous le texte + il semble que vos données json ne sont pas bien formatées – Arount

+0

J'ai ajouté quelques exemples de mes jeux de données @NieDzejkob –

Répondre

0

Vous avez posté des échantillons, et pour autant que je vous vois juste besoin de sauter des lignes vides.

RÉPONSE VIEUX-DESSOUS

Vous devez analyser JSON cette façon:

... 
with open('mytweets.json', mode='r', encoding='utf-8') as f: 
    tweet = json.load(f) 
    ... 

json.load() accepte un file-like object comme premier argument. Ce que vous essayez actuellement de faire est de lire le fichier ligne par ligne et d'essayer d'analyser chaque ligne comme une chaîne JSON séparée, et le fichier semble être formaté, de sorte que vous n'avez pas complet json dans aucune ligne.

Vous pouvez parcourir la liste des tweets de votre fichier (si ma réponse est correcte), pas les lignes de texte et appeler print(preprocess()) dans la boucle.

+0

Merci.Mais j'ai toujours cette erreur Fichier: ligne 39, en impression (prétraiter (tweet ['texte'])) UnicodeEncodeError: codec 'UCS-2' ne peut pas encoder les caractères dans la position 132-132: Caractère non-BMP non supporté dans Tk @bakatrouble –