Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
120 lines (92 sloc) 3.16 KB
import csv
def check_duplicate_entries(submission) -> bool:
    """
    Report whether any tweet_id occurs in more than one submission row.

    :param submission: sized collection of rows, each indexable by 'tweet_id'
    :return: True if there are duplicate tweet_ids, False otherwise
    """
    row_count = len(submission)
    distinct_ids = set()
    for row in submission:
        distinct_ids.add(row['tweet_id'])
    # Every repeated id collapses in the set, so the sizes differ iff a
    # tweet_id was seen more than once.
    return len(distinct_ids) != row_count
def check_submission_contains_all_tweets(submission, test):
    """
    Compare the tweet_ids of a submission against those of the test set.

    :param submission: iterable of rows, each indexable by 'tweet_id'
    :param test: iterable of test-set rows, each indexable by 'tweet_id'
    :return: dict with two sets: 'missing' holds the tweet_ids that are in
        the test set but not in the submission, 'not_in_test' holds the
        tweet_ids that are in the submission but not in the test set
    """
    seen = {row['tweet_id'] for row in submission}
    expected = {row['tweet_id'] for row in test}
    return {
        'missing': expected.difference(seen),
        'not_in_test': seen.difference(expected),
    }
def check_labels(submission):
    """
    Collect every label value that is neither 'gsw' nor 'not_gsw'.

    :param submission: iterable of rows, each indexable by 'label'
    :return: set of the distinct unexpected label values (empty if none)
    """
    allowed = {'gsw', 'not_gsw'}
    unexpected = set()
    for row in submission:
        value = row['label']
        if value not in allowed:
            unexpected.add(value)
    return unexpected
def check_scores(submission):
    """
    Count the rows whose 'score' value cannot be converted to a float.

    :param submission: iterable of rows, each indexable by 'score'
    :return: number of rows with an unparseable score
    :raises KeyError: if a row has no 'score' key at all
    """
    unparseable = 0
    for row in submission:
        try:
            float(row['score'])
        except (TypeError, ValueError):
            # ValueError: text that is not a number (including '').
            # TypeError: None, which csv.DictReader fills in for rows that
            # have fewer fields than the header.
            unparseable += 1
    return unparseable
def check(fname):
    """
    Validate a submission CSV and print a report of any problems found.

    The file must have the columns 'tweet_id', 'label' and 'score'. If any
    of them is missing, the missing columns are reported and the function
    returns without running the remaining checks. Otherwise the submission
    is compared against './data/test_tweets.csv' and checked for duplicate
    tweet_ids, missing or unknown tweet_ids, unexpected labels and
    unparseable scores, followed by a summary line.

    :param fname: path to the submission CSV file
    :return: None; all findings are printed to stdout
    """
    required_columns = ('tweet_id', 'label', 'score')

    # newline='' is what the csv module expects, so that newlines embedded
    # in quoted fields are handled correctly.
    with open(fname, 'r', newline='') as fin:
        reader = csv.DictReader(fin)
        # fieldnames is None for an empty file; treat that as "no columns".
        found_columns = reader.fieldnames or []
        submission = list(reader)

    missing_columns = [
        column for column in required_columns if column not in found_columns
    ]
    for column in missing_columns:
        print(f"column '{column}' missing, found columns: {found_columns}")
    if missing_columns:
        # The row-level checks below index into these columns, so they
        # cannot run; there is also no point in loading the test set.
        return

    with open('./data/test_tweets.csv', 'r', newline='') as fin:
        test_tweets = list(csv.DictReader(fin))

    ok = True

    if check_duplicate_entries(submission):
        print('Duplicate entries:\tthere are duplicate tweet_ids')
        ok = False

    completeness = check_submission_contains_all_tweets(submission, test_tweets)
    if completeness['missing']:
        print(f"Missing entries:\t{len(completeness['missing'])} tweet_ids are missing")
        ok = False
    if completeness['not_in_test']:
        print(f"Found {completeness['not_in_test']} tweet_ids that don't appear in test set")
        ok = False

    weird_labels = check_labels(submission)
    if weird_labels:
        print(f"Found unexpected labels: {weird_labels}, use 'gsw' and 'not_gsw'")
        ok = False

    unparseable_scores = check_scores(submission)
    if unparseable_scores > 0:
        print(f"could not parse {unparseable_scores} scores")
        ok = False

    print('*' * 80)
    if ok:
        print("Submission looks ok!")
    else:
        print("There are problems with your submission.")
    print('*' * 80)
if __name__ == '__main__':
    import sys

    # Inspect the argument count directly instead of wrapping the whole run
    # in `except IndexError`, so that an IndexError raised inside check() is
    # not misreported as a missing command-line argument.
    if len(sys.argv) > 1:
        check(sys.argv[1])
    else:
        print("path to submission file missing")
        print("usage: python -m check_submission /path/to/your/submission.csv")
You can’t perform that action at this time.