# Toy sentence-type classifier: derive count features from each sentence of a
# short Chinese sample text, label the sentences with a punctuation/conjunction
# heuristic, train an NLTK Naive Bayes classifier and classify one new sentence.
#
# Requires the NLTK tokenizer and POS-tagger data packages to be installed
# (see nltk.download); the exact package names depend on the NLTK version.
import re

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Full-width sentence terminators (ideographic full stop, exclamation mark,
# question mark). The English Punkt model behind sent_tokenize does not split
# on these, so Chinese text needs this extra pass.
_CJK_SENTENCE_RE = re.compile("[^\u3002\uff01\uff1f]+[\u3002\uff01\uff1f]*")

# Substring markers used by the labelling heuristic.
_COMMA_MARKS = (",", "\uff0c")  # ASCII comma and full-width comma
_CONJUNCTION_MARKS = ("and", "和")  # English "and" and its Chinese counterpart


def _split_sentences(text: str) -> list:
    """Split *text* into sentences on Western and full-width terminators."""
    pieces = []
    # sent_tokenize handles ".", "!" and "?"; each chunk it returns is then
    # split again on the full-width terminators it leaves untouched.
    for chunk in sent_tokenize(text):
        for piece in _CJK_SENTENCE_RE.findall(chunk):
            piece = piece.strip()
            if piece:
                pieces.append(piece)
    return pieces


def _label_sentence(sentence: str) -> str:
    """Return the heuristic training label for *sentence*.

    "compound" if it contains a conjunction marker, otherwise "complex" if it
    contains a comma, otherwise "simple". The checks are plain substring
    tests, so this is only a rough heuristic.
    """
    if any(mark in sentence for mark in _CONJUNCTION_MARKS):
        return "compound"
    if any(mark in sentence for mark in _COMMA_MARKS):
        return "complex"
    return "simple"


def extract_features(sentence: str) -> dict:
    """Return token and part-of-speech count features for one sentence.

    Args:
        sentence: The sentence to analyse.

    Returns:
        A dict with the keys "word_count", "verb_count", "adjective_count"
        and "noun_count".
    """
    # NOTE(review): word_tokenize and nltk.pos_tag are called with their
    # defaults, which target English. Chinese text is not segmented into
    # words, so these counts are coarse; a Chinese segmenter/tagger would be
    # needed for meaningful features.
    tokens = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)
    return {
        "word_count": len(tokens),
        "verb_count": sum(1 for _, pos in pos_tags if pos.startswith("V")),
        "adjective_count": sum(1 for _, pos in pos_tags if pos.startswith("JJ")),
        "noun_count": sum(1 for _, pos in pos_tags if pos.startswith("NN")),
    }


text = "Python是一种功能强大的编程语言。它也被称为最易学习的编程语言之一。Python常常用于Web开发、数据分析、人工智能等领域。然而,Python也有一些缺点。"
sentences = _split_sentences(text)

# One (features, label) pair per sentence of the sample text.
training_data = [
    (extract_features(sentence), _label_sentence(sentence))
    for sentence in sentences
]

classifier = nltk.NaiveBayesClassifier.train(training_data)

test_sentence = "Python经常被用于数据分析和机器学习。"
test_features = extract_features(test_sentence)
# Prints one of the labels seen during training. The sample text above only
# yields "simple" and "complex" examples, so "compound" cannot be predicted
# until sentences carrying that label are added to the training text.
print(classifier.classify(test_features))
# Second copy of the sentence-type classifier script: count features per
# sentence, heuristic labels, NLTK Naive Bayes training, one classification.
#
# Requires the NLTK tokenizer and POS-tagger data packages to be installed
# (see nltk.download); the exact package names depend on the NLTK version.
import re

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Full-width sentence terminators (ideographic full stop, exclamation mark,
# question mark). The English Punkt model behind sent_tokenize does not split
# on these, so Chinese text needs this extra pass.
_CJK_SENTENCE_RE = re.compile("[^\u3002\uff01\uff1f]+[\u3002\uff01\uff1f]*")

# Substring markers used by the labelling heuristic.
_COMMA_MARKS = (",", "\uff0c")  # ASCII comma and full-width comma
_CONJUNCTION_MARKS = ("and", "和")  # English "and" and its Chinese counterpart


def extract_features(sentence: str) -> dict:
    """Return token and part-of-speech count features for one sentence.

    Args:
        sentence: The sentence to analyse.

    Returns:
        A dict with the keys "word_count", "verb_count", "adjective_count"
        and "noun_count".
    """
    # NOTE(review): word_tokenize and nltk.pos_tag are called with their
    # defaults, which target English. Chinese text is not segmented into
    # words, so these counts are coarse; a Chinese segmenter/tagger would be
    # needed for meaningful features.
    tokens = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)
    return {
        "word_count": len(tokens),
        "verb_count": sum(1 for _, pos in pos_tags if pos.startswith("V")),
        "adjective_count": sum(1 for _, pos in pos_tags if pos.startswith("JJ")),
        "noun_count": sum(1 for _, pos in pos_tags if pos.startswith("NN")),
    }


def _split_sentences(text: str) -> list:
    """Split *text* into sentences on Western and full-width terminators."""
    pieces = []
    # sent_tokenize handles ".", "!" and "?"; each chunk it returns is then
    # split again on the full-width terminators it leaves untouched.
    for chunk in sent_tokenize(text):
        for piece in _CJK_SENTENCE_RE.findall(chunk):
            piece = piece.strip()
            if piece:
                pieces.append(piece)
    return pieces


def _label_sentence(sentence: str) -> str:
    """Return the heuristic training label for *sentence*.

    "compound" if it contains a conjunction marker, otherwise "complex" if it
    contains a comma, otherwise "simple". The checks are plain substring
    tests, so this is only a rough heuristic.
    """
    if any(mark in sentence for mark in _CONJUNCTION_MARKS):
        return "compound"
    if any(mark in sentence for mark in _COMMA_MARKS):
        return "complex"
    return "simple"


text = "Python是一种功能强大的编程语言。它也被称为最易学习的编程语言之一。Python常常用于Web开发、数据分析、人工智能等领域。然而,Python也有一些缺点。"
sentences = _split_sentences(text)

# One (features, label) pair per sentence of the sample text.
training_data = [
    (extract_features(sentence), _label_sentence(sentence))
    for sentence in sentences
]

classifier = nltk.NaiveBayesClassifier.train(training_data)

test_sentence = "Python经常被用于数据分析和机器学习。"
test_features = extract_features(test_sentence)
# Prints one of the labels seen during training. The sample text above only
# yields "simple" and "complex" examples, so "compound" cannot be predicted
# until sentences carrying that label are added to the training text.
print(classifier.classify(test_features))
# Tags: domain name filing (original page tag: "标签: 域名备案")